Replicating the Talk Statistics for Hackers

This post was entirely written using the IPython notebook. Its content is BSD-licensed. You can see a static view or download this notebook with the help of nbviewer at 20161029_ReplicatingStatisticsForHackers.ipynb.

In this post, we will use the framework for statistical testing presented in ThinkStats to replicate the contents of the talk given at Scipy 2016 by Jake Vanderplas: "Statistics for Hackers".

In [1]:
from IPython.display import YouTubeVideo
YouTubeVideo('Iq9DzN6mvYA')
Out[1]:

What framework? What's ThinkStats?

Think Stats (Second Edition) is an effort by Allen Downey to write a statistics textbook from scratch, using Python for practical calculations. One of the ideas worth taking away is the explanation he gives of statistical testing and the practical implementation he provides: it focuses on simulating the distribution of the test statistic under the null hypothesis instead of following the classical analytical approach.

His ideas are, I think, best summed up by the following image from his blog:

there is just one test

The framework used in the Think Stats approach mainly consists of this class (taken from the free online version of the book here):

In [2]:
class UnimplementedMethodException(Exception):
    "Raised when a subclass does not override a required method."

class HypothesisTest:
    def __init__(self, data):
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        "Returns p-value of actual data based on simulated data."
        self.test_stats = [self.TestStatistic(self.RunModel()) 
                           for _ in range(iters)]

        count = sum(1 for x in self.test_stats if x >= self.actual)
        self.pval = count / iters
        return self.pval

    def TestStatistic(self, data):
        "Test statistic for the current test."
        raise UnimplementedMethodException()

    def MakeModel(self):
        pass

    def RunModel(self):
        "Returns a simulated data sample."
        raise UnimplementedMethodException()

It mirrors the thinking in the illustration:

  • TestStatistic is the test statistic to be used for carrying out the statistical test
  • RunModel builds datasets that are similar to the original dataset but under the $\mathcal{H_0}$ hypothesis
  • PValue runs many simulations and returns the probability that the test statistic is at least as large as our observed effect in the sampled $\mathcal{H_0}$ hypothetical world

One of the questions that a statistician might ask you when using such a simulation framework is: how do we know that this way of doing things is equivalent to the established methodology in statistical testing?

To answer this question, we will work on the sample problems described by Jake Vanderplas in his Scipy 2016 talk and compare results. Slides can be found here.

First test: coin toss

For the coin toss problem, the statement is as follows:

You toss a coin 30 times and see 22 heads. Is it a fair coin?

Let's take the steps needed to define our statistical testing procedure (as defined by Allen Downey):

  • our data is a list of (heads, tails): [22, 8]
  • our model under the $\mathcal{H_0}$ hypothesis is that the coin is fair: the probability of it landing heads or tails is 0.5
  • the test statistic is the count of the number of heads in the 30 tosses

Let's implement this!

In [3]:
import random

class CoinTossTest(HypothesisTest):
    """Class for testing if a coin is fair.
    Assumes data will be given as a list like: [number_of_heads, number_of_tails]."""
    
    def TestStatistic(self, data):
        "Counts the number of heads in the data."
        return data[0]

    def RunModel(self):
        "Returns data generated under the H0 hypothesis (the coin is fair)."
        n = sum(self.data)
        tosses = [0, 0]
        for _ in range(n):
            if random.random() > 0.5:
                tosses[0] += 1
            else:
                tosses[1] += 1
        return tosses        

Let's test our test!

In [4]:
coin_test = CoinTossTest([22, 8])

We can compute the test statistic with our real data: it should return 22.

In [5]:
coin_test.actual
Out[5]:
22

Also, we can generate a new random sample of 30 coin tosses under the null hypothesis:

In [6]:
coin_test.RunModel()
Out[6]:
[13, 17]

Finally, let's sample the distribution of our test statistic and compute the p-value for our particular data:

In [7]:
coin_test.PValue()
Out[7]:
0.01

We see our p-value is low. However, since we simulated only 1000 experiments, it is not the exact p-value, which should be 0.008. Let's compare the obtained distribution with the exact distribution of the test statistic under the null hypothesis.
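As a quick sanity check on that 0.008 figure, the exact one-sided p-value can be computed directly from the binomial distribution. A minimal sketch using `math.comb` (which requires Python 3.8+, so it was not available when this notebook was written):

```python
# Exact p-value for observing at least 22 heads in 30 fair coin tosses:
# sum the binomial probabilities P(H = h) = C(30, h) / 2^30 for h >= 22.
from math import comb

n, k = 30, 22
p_exact = sum(comb(n, h) for h in range(k, n + 1)) / 2**n
print(p_exact)  # ~0.008
```

This agrees with the 0.8 % answer quoted in the slides and gives us the reference value the simulation should converge to.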

In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
In [9]:
def plot_simulated_distribution(test, title, bins=30, range=None):
    "Plots the simulated test statistic distribution."
    plt.hist(test.test_stats, bins=bins, range=range,
             cumulative=False, density=True,
             label='simulated test stat\ndistribution')
    ylim = plt.ylim()
    plt.vlines(test.actual, *ylim, label='actual test stat\n(p={:.3f})'.format(test.pval))
    plt.legend(loc='upper left')
    plt.xlabel('test statistic value')
    plt.title(title)
In [10]:
plot_simulated_distribution(coin_test, "Coin test", range=(0, 30), bins=31)

Let's also plot the analytical distribution:

In [11]:
from functools import reduce

def ncr(n, r):
    """Returns n choose r.
    Source: http://stackoverflow.com/questions/4941753/is-there-a-math-ncr-function-in-python"""
    r = min(r, n - r)
    if r == 0: return 1
    numer = reduce(int.__mul__, range(n, n - r, -1))
    denom = reduce(int.__mul__, range(1, r + 1))
    return numer // denom

def plot_analytical_coin(n=30):
    "Plots analytical distribution under H_0 hypothesis of n coin tosses landing h heads."
    heads = list(range(n+1))
    probs = [ncr(n, h) * 0.5 ** h * 0.5 ** (n - h) for h in heads]
    plt.plot([h + 0.5 for h in heads], probs, drawstyle='steps', 
             label='analytical test stat\ndistribution')
In [12]:
plot_analytical_coin()
plt.legend()
Out[12]:
<matplotlib.legend.Legend at 0x10f276358>

Let's now put the two together:

In [13]:
plot_analytical_coin()
plot_simulated_distribution(coin_test, "Coin test", range=(0, 30), bins=31)

The sampled distribution is quite close to the exact one. Can we do better?

In [14]:
plt.figure(figsize=(10, 7))
for ind, iters in enumerate([1000, 5000, 10000, 40000]):
    plt.subplot(2, 2, ind + 1)
    coin_test = CoinTossTest([22, 8])
    pval = coin_test.PValue(iters=iters)
    plot_analytical_coin()
    plot_simulated_distribution(coin_test, "Coin test {} samples".format(iters), 
                                range=(0, 30), bins=31)
    plt.xlim(0, 30)
plt.tight_layout()

What we see here is that the sampled distribution, even with as few as 1000 samples, gives an estimate close to the correct answer, which is 0.8 %.

Let's now move on to the next study case.

Second test: difference in means

The second problem is as follows:

A group of star sneeches got the following scores: 84, 72, 57, 46, 63, 76, 99, 91, while a group of cross sneeches got these scores: 81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69. There is a difference in means between the two groups (73.5 for the stars, 66.9 for the crosses), but is it significant?

Again, let's describe our test setting:

  • our data is a list of scores for the two groups: it's a list of length 2 that contains all the values [[84, 72, 57, 46, 63, 76, 99, 91], [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]]
  • our model under the $\mathcal{H_0}$ hypothesis is that there is no difference between the two samples: each group comes from the same distribution of score values
  • the test statistic is the same as in Welch's t-test $$ t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}} $$

Let's implement the statistical test for this case using the simulation methodology:

In [15]:
import numpy as np
In [16]:
class MeanTest(HypothesisTest):
    """Test for checking if the difference in means of samples is significant."""
    
    def TestStatistic(self, data):
        "Computes Welch's t-test statistic."
        group1, group2 = data
        n1, n2 = len(group1), len(group2)
        s1, s2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
        m1, m2 = np.mean(group1), np.mean(group2)
        return (m1 - m2) / np.sqrt(s1/n1 + s2/n2)

    def RunModel(self):
        "Shuffles the existing data and splits it into two subgroups."
        group1, group2 = self.data[:]
        pool = group1 + group2
        random.shuffle(pool)
        return pool[:len(group1)], pool[len(group1):]
In [17]:
mean_test = MeanTest([[84, 72, 57, 46, 63, 76, 99, 91], 
                      [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]])

Let's check that we implemented the right test statistic: we expect a value of 0.932 (taken from Jake's slides).

In [18]:
mean_test.actual
Out[18]:
0.93161477717115826
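
As an additional cross-check, `scipy.stats.ttest_ind` with `equal_var=False` performs exactly this Welch test, so it should report the same statistic:

```python
from scipy.stats import ttest_ind

group1 = [84, 72, 57, 46, 63, 76, 99, 91]
group2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]

# equal_var=False selects Welch's t-test (unequal variances)
res = ttest_ind(group1, group2, equal_var=False)
print(res.statistic)  # ~0.9316
```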

Now, let's see if we generate random samples correctly:

In [19]:
mean_test.RunModel()
Out[19]:
([81, 69, 62, 61, 69, 74, 87, 69],
 [72, 63, 57, 46, 65, 84, 99, 76, 56, 66, 91, 44])

Finally, let's compute a p-value:

In [20]:
mean_test.PValue()
Out[20]:
0.176

What does the sampled test statistic distribution under the $\mathcal{H_0}$ hypothesis look like?

In [21]:
plot_simulated_distribution(mean_test, "Difference in means test")

Again, we will compare the real distribution with the sampled one. Let's write a function that does just that. It turns out that Jake has already put all the information regarding this in his slides. The test statistic has the following probability density:

$$ p(t; \nu) = \frac{\Gamma(\frac{\nu + 1}{2})}{\sqrt{\nu \pi} \Gamma(\frac{\nu}{2}) } \left( 1 + \frac{t^2}{\nu}\right)^{-\frac{\nu + 1}{2}} $$
In [22]:
from scipy.special import gamma

def plot_analytical_mean(data=[[84, 72, 57, 46, 63, 76, 99, 91], 
                               [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]]):
    "Plots analytical distribution under H_0 hypothesis of the Welch t-stat with given sample sizes."
    group1, group2 = data
    n1, n2 = len(group1), len(group2)
    s1, s2 = np.var(group1, ddof=1), np.var(group2, ddof=1)
    m1, m2 = np.mean(group1), np.mean(group2)
    
    nu = (s1/n1 + s2/n2)**2 / (s1**2 / n1**2 / (n1 - 1) + s2**2 / n2**2 / (n2 - 1))
    t = np.arange(-5, 5, 0.01)
    density = gamma((nu + 1) / 2) / (np.sqrt(nu * np.pi) * gamma(nu / 2)) * (1 + t**2/nu) ** (-(nu + 1)/2)
    
    plt.plot(t, density, drawstyle='steps', 
             label='analytical test stat\ndistribution')
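As a side note, this density is just Student's t distribution with the Welch–Satterthwaite degrees of freedom, so we can check the hand-coded formula against `scipy.stats.t.pdf` (a quick sketch; the value 10.7 is roughly the degrees of freedom computed for our two groups):

```python
from scipy.special import gamma
from scipy.stats import t as student_t
import numpy as np

# Student's t density, written out with the gamma function as above,
# compared against scipy's built-in implementation.
nu = 10.7  # approximately the Welch-Satterthwaite dof for our groups
ts = np.arange(-5, 5, 0.5)
manual = (gamma((nu + 1) / 2) / (np.sqrt(nu * np.pi) * gamma(nu / 2))
          * (1 + ts**2 / nu) ** (-(nu + 1) / 2))
print(np.allclose(manual, student_t.pdf(ts, nu)))  # True
```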
In [23]:
plot_analytical_mean()
plt.legend()
Out[23]:
<matplotlib.legend.Legend at 0x10fb8edd8>

What if we put both plots together, the simulated one and the analytical one?

In [24]:
plot_analytical_mean()
plot_simulated_distribution(mean_test, "Difference in means test")

Both distributions look quite similar. Let's check the impact of the number of simulations on the resulting simulated distributions:

In [25]:
plt.figure(figsize=(10, 7))
for ind, iters in enumerate([1000, 5000, 10000, 40000]):
    plt.subplot(2, 2, ind + 1)
    mean_test = MeanTest([[84, 72, 57, 46, 63, 76, 99, 91], 
                      [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]])
    pval = mean_test.PValue(iters=iters)
    plot_analytical_mean()
    plot_simulated_distribution(mean_test, "Mean test {} samples".format(iters))
plt.tight_layout()

Again, we see that the approximation by simulated sampling is quite good compared with the analytical test statistic. And even if, in this case, a thousand samples does not get very close to the exact density, it seems enough to get a good idea of the p-value.

An interesting side note here is that Jake Vanderplas does not actually use the test statistic of the analytical test, but instead opts for a different, simpler statistic: the difference in means between the two samples. Let's see if we can reproduce his result:

In [26]:
class MeanTest2(MeanTest):
    "Changes the test statistic to a difference in means."
    def TestStatistic(self, data):
        "Computes the difference in means between the two samples."
        group1, group2 = data
        m1, m2 = np.mean(group1), np.mean(group2)
        return m1 - m2
In [27]:
mean_test2 = MeanTest2([[84, 72, 57, 46, 63, 76, 99, 91], 
                      [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]])

The observed effect is:

In [28]:
mean_test2.actual
Out[28]:
6.5833333333333286

This value is similar to the one Jake shows on one of his slides. What is the p-value?

In [29]:
mean_test2.PValue()
Out[29]:
0.157
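
Incidentally, with only 20 scores there are just $\binom{20}{8} = 125970$ ways to split the pooled data, so for this simple statistic we can enumerate every permutation and obtain the exact p-value instead of sampling. A sketch (this is my addition, not part of Jake's talk):

```python
# Exact permutation p-value for the difference-in-means statistic:
# enumerate all C(20, 8) splits of the pooled scores and count how
# often the mean difference is at least as large as the observed one.
from itertools import combinations

group1 = [84, 72, 57, 46, 63, 76, 99, 91]
group2 = [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]
pool = group1 + group2
total = sum(pool)
observed = sum(group1) / 8 - sum(group2) / 12  # 6.583...

count = splits = 0
for idx in combinations(range(len(pool)), len(group1)):
    s = sum(pool[i] for i in idx)
    diff = s / 8 - (total - s) / 12
    if diff >= observed - 1e-9:  # small tolerance for float comparison
        count += 1
    splits += 1

print(count / splits)  # close to the sampled 0.157
```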

Again, we find something close to what Jake has. What does the sampled distribution look like?

In [30]:
plot_simulated_distribution(mean_test2, "Difference in means test using a simpler test statistic", bins=50)

Let's see what this looks like with 10000 iterations (which is what Jake seems to use in his presentation):

In [31]:
pval = mean_test2.PValue(iters=10000)
plot_simulated_distribution(mean_test2, "Difference in means test using a simpler test statistic", bins=50)
plt.xlim(-20, 20)
Out[31]:
(-20, 20)

Realizing that Jake did not use the "official" test statistic for his simulation raises interesting questions:

  • are all test statistics equivalent?
  • if not, how do I choose one?
  • what if two different test statistics yield two different p-values? Which one should I trust?

In my current understanding, I assume that not all test statistics are equal. For instance, if I use $t = \sin(\bar{X}_1 - \bar{X}_2)$, I'm not sure the result will even make sense. You know what, let's actually test this.

In [32]:
class MeanTest3(MeanTest):
    "Changes the test statistic to the sine of the difference in means."
    def TestStatistic(self, data):
        "Computes the sine of the difference in means between the two samples."
        group1, group2 = data
        m1, m2 = np.mean(group1), np.mean(group2)
        return np.sin(m1 - m2)
In [33]:
mean_test3 = MeanTest3([[84, 72, 57, 46, 63, 76, 99, 91], 
                      [81, 69, 74, 61, 56, 87, 69, 65, 66, 44, 62, 69]])
mean_test3.PValue(iters=10000)
Out[33]:
0.4139
In [34]:
plot_simulated_distribution(mean_test3, "Testing with a silly statistic", bins=50)

Clearly, what we see above is nonsense: it does not have the intuitive bell shape we expect from a well-defined test statistic. So that answers the first question. However, using Welch's t-test statistic and using the simple difference in means of the two samples are not the same thing. So where's the difference? Hopefully, someone can point me to good references on this topic.

Following this line of inquiry, I'm also wondering about the result shown in Jake's slides: using two different test statistics, he obtains the same p-value. I'm guessing this is just a coincidence. Or is it not?

Conclusions

I won't replicate the other parts in Jake Vanderplas' slides (bootstrap resampling and cross-validation) so that this post just focuses on statistics and hypothesis testing.

As we have seen, the simulation framework proposed in Think Stats is very simple to apply to the two problems found in Jake Vanderplas' Statistics for Hackers presentation. Moreover, we found through comparison of the exact solution and the approximate simulation solution that both approaches yield the same result. While trying to pinpoint the difference in philosophy, we also saw how the simulation approach is very flexible and allows easily switching from one test statistic to another, which is one of the hard points when doing real statistics (think of all the different names for all these tests...).

I think I still have many things to learn with respect to this statistical testing topic and I would be grateful for links regarding the simulation/analytical frontier as well as criteria for choices of test statistic.
