import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Build a small, reproducible population: N values drawn uniformly from [0, 10).

N = 14
np.random.seed(12345)  # fixed seed so every run produces the same population
population = np.random.uniform(0, 10, N)

population  # bare expression: shows the array when run as a notebook cell

array([9.29616093, 3.16375555, 1.83918812, 2.04560279, 5.67725029,
       5.95544703, 9.6451452 , 6.53177097, 7.48906638, 6.53569871,
       7.47714809, 9.61306736, 0.08388298, 1.06444377])

# Animation setup: the population is drawn on the upper line (y = +0.5) and
# each random sample of size n is drawn on the lower line (y = -0.5).
n = 3  # sample size; also read by update() and by the simulation loops below

fig, ax = plt.subplots()
# NOTE(review): these two lists are not read anywhere in the visible code
# (update() binds its own local xdata/ydata) — likely leftover boilerplate.
xdata, ydata = [], []
# Population points (white-filled markers) plus a dashed line at their mean.
ax.plot(population, np.zeros_like(population) + 0.5, '-o', mfc='white')
ax.vlines(population.mean(), 0., 0.9, linestyles='dashed', label='population mean μ')
# Empty red line; update() fills it with the current sample on every frame.
ln, = ax.plot([], [], '-ro')
# Dashed red marker for the sample mean; created at x=0 and moved by update().
vln = ax.vlines(0, -0.9, 0.0, label=r'sample mean $\bar{x}$', color='r', linestyles='dashed')

ax.legend()

def init():
    """Fix the axis limits for the animation and return the sample line.

    Passed to FuncAnimation as ``init_func``. The x-range matches the
    [0, 10] interval the population is drawn from; the y-range leaves room
    for the population row (y = 0.5) and the sample row (y = -0.5).
    """
    ax.set_ylim(-1, 1)
    ax.set_xlim(0, 10)
    return (ln,)

def update(frame):
    """Draw a new random sample and move the sample artists onto it.

    Called by FuncAnimation once per frame.

    Parameters
    ----------
    frame
        Frame value supplied by FuncAnimation; it is not used, because
        every frame simply shows an independent random sample.

    Returns
    -------
    tuple
        The artists modified by this call: the sample line ``ln`` and the
        sample-mean marker ``vln``. With ``blit=True`` only returned artists
        are redrawn, so both must be listed.
    """
    # n distinct population members (sampling without replacement).
    sample = np.random.choice(population, size=n, replace=False)
    # The sample is plotted on the lower row, y = -0.5.
    ln.set_data(sample, np.zeros_like(sample) - 0.5)

    # vln holds a single vertical segment; move both of its end points to
    # the x position of the sample mean while keeping their y values.
    segments = vln.get_segments()
    segments[0][:, 0] = sample.mean()
    vln.set_segments(segments)
    return ln, vln

# Build the animation: 20 frames, each one calling update() to show a new
# random sample; `interval` is the delay between frames in milliseconds.
ani = FuncAnimation(fig, update, frames=np.arange(20),
                    init_func=init, blit=True, interval=2000)
# Close the figure — presumably so the notebook shows only the video below
# and not an additional static image of the figure.
plt.close(fig)
# Render the animation as an HTML5 <video> tag and display it as cell output.
HTML(ani.to_html5_video())

#ani.save('variance_anim.mp4')

import pandas as pd

# Monte Carlo experiment: for many random samples of size n (drawn without
# replacement) record the mean absolute distance of the sample points to
#   (a) the sample's own mean, and
#   (b) the true population mean.
n_draws = 4000

# The population mean is the same for every draw, so compute it once
# instead of on every loop iteration.
population_mean = np.mean(population)

average_dist_to_sample_mean = []
average_dist_to_pop_mean = []

for _ in range(n_draws):
    sample = np.random.choice(population, size=n, replace=False)
    sample_mean = np.mean(sample)
    average_dist_to_sample_mean.append(np.mean(np.abs(sample - sample_mean)))
    average_dist_to_pop_mean.append(np.mean(np.abs(sample - population_mean)))

# One row per draw, one column per distance measure.
df = pd.DataFrame(
    {
        'avg dist to sample mean': average_dist_to_sample_mean,
        'avg dist to pop mean': average_dist_to_pop_mean,
    }
)

df.head()

from matplotlib.gridspec import GridSpec

# Two panels side by side: a wide one (3/4 of the width) with the per-draw
# distances as line traces, and a narrow one with their histograms drawn
# sideways (counts on x, distance on y).
fig = plt.figure(layout="constrained", figsize=(10, 5))

gs = GridSpec(1, 4, figure=fig)
ax1 = fig.add_subplot(gs[0, :3])
ax2 = fig.add_subplot(gs[0, 3])

df.plot.line(lw=1, ax=ax1)

for label, data in df.items():
    hist, bin_edges = np.histogram(data, bins=30)
    # np.histogram returns one more bin edge than counts; repeat the last
    # count so both arrays have the same length and can be plotted together.
    hist = np.append(hist, hist[-1])
    ax2.plot(hist, bin_edges, label=label)

ax2.legend(fontsize=8)
ax2.set_title('histogram of distances')
ax2.set_xlabel('counts')
ax2.set_ylabel('distance')

Text(0, 0.5, 'distance')

# Running (cumulative) mean of each distance column: the cumulative sum
# after k draws divided by k, for k = 1 .. number of draws.
draws_so_far = np.arange(1, len(df) + 1)
running_mean_df = df.cumsum().div(draws_so_far, axis=0)

running_mean_df.plot()

<Axes: >

# For many random samples of size n (without replacement), store the pair
# [sample mean, S^2_n], where S^2_n is the variance with the default
# divisor n (ddof=0), i.e. without the n-1 correction.
simulated_data = []
for _ in range(10000):
    draw = np.random.choice(population, size=n, replace=False)
    simulated_data.append([draw.mean(), draw.var()])
    

# Two panels side by side: a wide scatter of S^2_n against the sample mean,
# and a narrow sideways histogram of S^2_n (counts on x, S^2_n on y).
fig = plt.figure(layout="constrained", figsize=(10, 5))

gs = GridSpec(1, 4, figure=fig)
ax1 = fig.add_subplot(gs[0, :3])
ax2 = fig.add_subplot(gs[0, 3])


pd.DataFrame(simulated_data, columns=['sample mean', '$S^2_n$']).plot.scatter(x='sample mean', y='$S^2_n$', ax=ax1, alpha=.5)
# Reference lines are drawn after the scatter so they span the axis limits
# that the data produced.
ax1.vlines(population.mean(), *ax1.get_ylim(), label='population mean', color='red')
ax1.hlines(np.var(population), *ax1.get_xlim(), label='population variance', color='gray')
ax1.legend()

# Column 1 of simulated_data holds the S^2_n values.
hist, bin_edges = np.histogram(np.array(simulated_data)[:, 1], bins=10)
# np.histogram returns one more bin edge than counts; repeat the last count
# so both arrays have the same length and can be plotted together.
hist = np.append(hist, hist[-1])
# Label the curve explicitly instead of reusing a `label` variable left over
# from an earlier loop, which named an unrelated column.
ax2.plot(hist, bin_edges, label='$S^2_n$')
ax2.hlines(np.var(population), *ax2.get_xlim(), label='population variance', color='gray')
ax2.set_title('histogram of $S^2_n$')
ax2.set_xlabel('counts')
ax2.set_ylabel('$S^2_n$')

Text(0, 0.5, '$S^2_n$')

	avg dist to sample mean	avg dist to pop mean
0	2.475660	3.355701
1	1.074484	3.762793
2	2.206585	2.172278
3	1.535100	2.136968
4	0.319711	0.597730

The $n-1$ sample variance estimation problem intuition

The problem¶

Generating samples from a population and analyzing the effect on distances¶

Another visualization of distances¶

Visualizing the error as a function of $\bar{x}$¶

TL;DR¶

Comments