import pandas as pd
# Study totals, one row per group: how many days were observed and how many
# attendees were counted over those days.
df = pd.DataFrame(
    {
        'number of days': [24, 80],
        'attendees': [49702, 172351],
    },
    index=['rugby game', 'control'],
)
df

# Mean daily attendance for each group.
df['attendees per day'] = df['attendees'].div(df['number of days'])
df

# Gap in mean daily attendance: control days minus rugby game days.
df.loc['control', 'attendees per day'] - df.loc['rugby game', 'attendees per day']

83.470833333333303

83 / 2100

0.039523809523809524

import numpy as np
np.random.seed(42)

def single_day(mean, std):
    """Draw one day's simulated ER attendance from a normal distribution.

    The draw comes from NumPy's global random state, centred on ``mean``
    with standard deviation ``std``.
    """
    attendees = np.random.normal(mean, std)
    return attendees

single_day(2100, 100)

2149.6714153011235

dist = [single_day(2100, 100) for _ in range(24)]
dist

[2086.1735698828816,
 2164.7688538100692,
 2252.3029856408025,
 2076.5846625276663,
 2076.5863043050817,
 2257.921281550739,
 2176.743472915291,
 2053.052561406505,
 2154.2560043585963,
 2053.6582307187537,
 2053.427024642974,
 2124.1962271566035,
 1908.67197553422,
 1927.5082167486967,
 2043.7712470759027,
 1998.7168879665576,
 2131.4247332595273,
 2009.1975924478788,
 1958.769629866471,
 2246.5648768921556,
 2077.4223699513464,
 2106.7528204687924,
 1957.5251813786542,
 2045.5617275474817]

pd.Series(dist).describe()

count      24.000000
mean     2080.898268
std        96.697184
min      1908.671976
25%      2035.127833
50%      2076.585483
75%      2137.132551
max      2257.921282
dtype: float64

pd.Series([single_day(2100, 100) for _ in range(240)]).describe()

count     240.000000
mean     2100.234578
std        99.303645
min      1775.873266
25%      2029.066005
50%      2106.225511
75%      2162.773027
max      2485.273149
dtype: float64

pd.Series([single_day(2100, 100) for _ in range(2400)]).describe()

count    2400.000000
mean     2103.999784
std        98.471270
min      1798.048784
25%      2037.499539
50%      2102.853160
75%      2168.237130
max      2419.310757
dtype: float64

pd.Series([single_day(2100, 100) for _ in range(24000)]).describe()

count    24000.000000
mean      2099.943087
std         99.887928
min       1707.759975
25%       2032.403398
50%       2100.169540
75%       2167.442840
max       2547.908425
dtype: float64

df['attendees'].sum() / (df['number of days'].sum())

2135.125

class HypothesisTest:
    """Base class for simulation-based hypothesis tests.

    Subclasses must implement ``TestStatistic`` and ``RunModel``; ``MakeModel``
    is an optional hook that runs once during construction.

    Attributes set by this class:
        data: the observed data passed to the constructor.
        actual: the test statistic computed on the observed data.
        test_stats: simulated test statistics, populated by ``PValue``.
    """

    def __init__(self, data):
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        """Return the p-value of the observed data based on simulated data.

        Runs the model ``iters`` times and returns the fraction of simulated
        test statistics that are at least as large as the observed one.

        Raises:
            ValueError: if ``iters`` is not a positive number of runs.
        """
        if iters <= 0:
            raise ValueError("iters must be a positive integer")

        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def TestStatistic(self, data):
        """Return the test statistic for ``data``; subclasses must override."""
        raise NotImplementedError("subclasses must implement TestStatistic")

    def MakeModel(self):
        """Optional hook to build the null-hypothesis model; no-op by default."""
        pass

    def RunModel(self):
        """Return one simulated data sample; subclasses must override."""
        raise NotImplementedError("subclasses must implement RunModel")

# Spread of the simulated daily attendance. ERTest forwards this value to
# single_day as `std`, so despite the name it acts as a standard deviation.
variance_param = 100

class ERTest(HypothesisTest):
    """Simulation-based test replicating the Furyk et al. (2012) study.

    ``data`` is a ``(game_total, control_total)`` pair: attendee counts summed
    over the game days and over the control days respectively.

    ``variance_param`` is forwarded to ``single_day`` as its ``std`` argument,
    so despite the name it is a standard deviation.
    """

    # Number of days observed in each arm of the study.
    GAME_DAYS = 24
    CONTROL_DAYS = 80
    # Attendees per day pooled over both arms, (49702 + 172351) / (24 + 80);
    # used as the common daily mean when simulating under H0.
    POOLED_DAILY_MEAN = 2135.125

    def __init__(self, data, variance_param):
        # Assign before delegating: the base initialiser already calls
        # MakeModel() and TestStatistic(), so the attribute must exist by then
        # for any hook that reads it.
        self.variance_param = variance_param
        super().__init__(data)

    def TestStatistic(self, simulated_data):
        """Return the absolute difference in mean daily attendees.

        Computed as control-day mean minus game-day mean, from the
        ``(game_total, control_total)`` pair in ``simulated_data``.
        """
        sum_game, sum_control = simulated_data
        return sum_control / self.CONTROL_DAYS - sum_game / self.GAME_DAYS

    def RunModel(self):
        """Return simulated ``(game_total, control_total)`` under H0.

        Every one of the 104 days is drawn from the same distribution, so any
        difference between the two arms is due to chance alone.
        """
        def simulated_total(n_days):
            return sum(single_day(self.POOLED_DAILY_MEAN, self.variance_param)
                       for _ in range(n_days))

        # Game days are drawn first, then control days.
        return (simulated_total(self.GAME_DAYS),
                simulated_total(self.CONTROL_DAYS))

# Observed totals as (game-day attendees, control-day attendees), matching the
# order in which ERTest.TestStatistic unpacks its argument.
data = (49702, 172351)

ert = ERTest(data, variance_param)

# Test statistic of the observed data, computed by the constructor.
ert.actual

83.4708333333333

ert.PValue(iters=10000)

0.0001

import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline

def plot_test_stat(test, title=""):
    """Plot the simulated test-statistic distribution and the observed value.

    Args:
        test: object exposing ``test_stats`` (the simulated statistics) and
            ``actual`` (the observed statistic).
        title: optional title for the figure.
    """
    # `density=True` normalises the histogram; it replaces the `normed`
    # keyword, which was removed in Matplotlib 3.1.
    plt.hist(test.test_stats, bins=30, cumulative=False, density=True)
    # Capture the y-range after drawing the histogram so the marker line
    # spans the full height of the plot.
    ylim = plt.ylim()
    plt.vlines(test.actual, *ylim, label='observed test stat')
    plt.legend(loc='upper left')
    plt.xlabel('test statistic: control - game days (in people per day)')
    plt.title(title)

plot_test_stat(ert, "variance: {} people per day".format(variance_param))

# Repeat the test with a larger spread for the daily model (500 instead of 100).
variance_param = 500
ert1 = ERTest(data, variance_param)
ert1.PValue(iters=10000)

0.2327

plot_test_stat(ert1, "variance: {} people per day".format(variance_param))

# Sweep the spread of the daily model and record the p-value at each setting.
variances = [10, 50, 100, 150, 200, 250, 300, 350, 400, 500, 750, 1000]
pvalues = []
# Use a dedicated loop name: iterating with `variance_param` would overwrite
# the module-level value that other cells read.
for spread in variances:
    ert2 = ERTest(data, spread)
    pvalues.append(ert2.PValue(iters=1000))

# One column per list; no row-wise construction plus transpose needed.
df2 = pd.DataFrame({
    'variance (people per day)': variances,
    'p-value': pvalues,
})
df2

df2.plot.line(x='variance (people per day)', y='p-value', style='-o')

<matplotlib.axes._subplots.AxesSubplot at 0x111051b38>

	variance (people per day)	p-value
0	10.0	0.000
1	50.0	0.000
2	100.0	0.000
3	150.0	0.006
4	200.0	0.040
5	250.0	0.088
6	300.0	0.115
7	350.0	0.140
8	400.0	0.178
9	500.0	0.245
10	750.0	0.300
11	1000.0	0.391

Do People Attend Emergency Rooms Less During Sports Events?

What is this study about?

Single day model

Running statistical tests

Conclusions and last plot

Comments