# necessary imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()

# Synthetic data set: three 2-D Gaussian blobs, each described as
# (number of samples, scale applied to the standard-normal draws, centre).
blob_specs = [
    (150, 0.75, [1, 0]),
    (50, 0.25, [-0.5, 0.5]),
    (50, 0.5, [-0.5, -0.5]),
]
# Blobs are drawn in the order listed and stacked row-wise into one array.
points = np.vstack([
    np.random.randn(count, 2) * scale + np.array(centre)
    for count, scale, centre in blob_specs
])

# Scatter-plot the generated data, then outline each blob with an unfilled
# circle centred on the blob's centre and with radius equal to half the scale
# factor that was used to generate it.
plt.scatter(points[:, 0], points[:, 1])
ax = plt.gca()
ax.add_artist(plt.Circle(np.array([1, 0]), 0.75/2, fill=False, lw=3))
ax.add_artist(plt.Circle(np.array([-0.5, 0.5]), 0.25/2, fill=False, lw=3))
ax.add_artist(plt.Circle(np.array([-0.5, -0.5]), 0.5/2, fill=False, lw=3))

<matplotlib.patches.Circle at 0x1a7f0c18>

def initialize_centroids(points, k):
    """Pick k starting centroids by sampling rows of ``points`` at random.

    A copy of ``points`` is shuffled along its first axis (the caller's array
    is left untouched) and the first ``k`` rows of that shuffled copy are
    returned. Randomness comes from numpy's global random state.
    """
    shuffled = points.copy()
    np.random.shuffle(shuffled)
    chosen = shuffled[:k]
    return chosen

# Sanity check: draw three random data points to use as starting centroids.
initialize_centroids(points, 3)

array([[-0.48840901, -0.14088659],
       [-0.53984176,  0.53868892],
       [ 0.98947381,  0.30255966]])

# Plot the data and overlay one random draw of three initial centroids
# as larger red markers.
plt.scatter(points[:, 0], points[:, 1])
centroids = initialize_centroids(points, 3)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)

<matplotlib.collections.PathCollection at 0x1f01c940>

def closest_centroid(points, centroids):
    """Assign every point to its nearest centroid (Euclidean distance).

    Returns an integer array with one entry per point holding the row index,
    within ``centroids``, of the centroid closest to that point. On a tie the
    lowest index wins, as with ``np.argmin``.
    """
    # Insert a middle axis into the centroids so they broadcast against the
    # points: one slab of per-point coordinate offsets per centroid.
    offsets = points - centroids[:, np.newaxis]
    squared_lengths = (offsets ** 2).sum(axis=2)
    distances = np.sqrt(squared_lengths)
    # Axis 0 runs over centroids, so argmin along it yields one label per point.
    return np.argmin(distances, axis=0)

# Draw a fresh set of three centroids and label every point with the index
# of the centroid nearest to it.
c = initialize_centroids(points, 3)
closest_centroid(points, c)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2,
       1, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0,
       0, 2, 2, 2, 0, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1,
       2, 0, 1, 1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1], dtype=int64)

# Display the centroids drawn above; the following cells dissect the
# distance computation step by step using them.
c

array([[-0.04883724,  0.6316724 ],
       [ 0.27231992,  0.32268518],
       [-0.76809563,  0.82267701]])

# Insert a length-1 axis between the centroid axis and the coordinate axis,
# so that the centroids can later be broadcast against an array of points.
c_extended = c[: , np.newaxis, :]
c_extended

array([[[-0.04883724,  0.6316724 ]],

       [[ 0.27231992,  0.32268518]],

       [[-0.76809563,  0.82267701]]])

# Confirm the new layout: (centroid, inserted axis, coordinate).
c_extended.shape

(3L, 1L, 2L)

# Take the first data point as a single example.
p = points[0]
p

array([ 1.13251313, -0.49784654])

# Broadcasting a single point: p is subtracted from every centroid at once.
c_extended - p

array([[[-1.18135037,  1.12951893]],

       [[-0.86019321,  0.82053172]],

       [[-1.90060876,  1.32052355]]])

# The result keeps the shape of c_extended: one offset vector per centroid.
(c_extended - p).shape

(3L, 1L, 2L)

# Broadcasting several points: the first four points are subtracted from
# each centroid, giving one block of four offset vectors per centroid.
points[:4] - c_extended

array([[[  1.18135037e+00,  -1.12951893e+00],
        [  3.19833710e-01,  -5.72897144e-02],
        [  1.66808263e+00,   4.60938179e-01],
        [  1.45933022e+00,  -1.33264298e+00]],

       [[  8.60193206e-01,  -8.20531719e-01],
        [ -1.32345648e-03,   2.51697498e-01],
        [  1.34692546e+00,   7.69925391e-01],
        [  1.13817306e+00,  -1.02365577e+00]],

       [[  1.90060876e+00,  -1.32052355e+00],
        [  1.03909210e+00,  -2.48294333e-01],
        [  2.38734102e+00,   2.69933560e-01],
        [  2.17858861e+00,  -1.52364760e+00]]])

# Square the offsets, sum over the coordinate axis (axis 2) and take the
# square root: Euclidean distances, one row per centroid, one column per point.
np.sqrt(((points[:4] - c_extended)**2).sum(axis=2))

array([[ 1.63444233,  0.32492417,  1.73059633,  1.97625455],
       [ 1.18878284,  0.25170098,  1.55144878,  1.53078707],
       [ 2.31432412,  1.06834567,  2.40255307,  2.6585241 ]])

# argmin along axis 0 (the centroid axis) picks, for each of the four points,
# the index of the centroid at the smallest distance.
np.argmin(np.sqrt(((points[:4] - c_extended)**2).sum(axis=2)), axis=0)

array([1, 1, 1, 1], dtype=int64)

def move_centroids(points, closest, centroids):
    """Return updated centroids: each is the mean of the points assigned to it.

    Parameters
    ----------
    points : array whose rows are the data points.
    closest : array with one entry per point, giving the row index in
        ``centroids`` that the point is currently assigned to.
    centroids : array whose rows are the current centroids.

    Returns
    -------
    A new float array shaped like ``centroids``. A centroid that has no
    points assigned to it keeps its previous position. Taking the mean of an
    empty selection would yield NaN (and a RuntimeWarning), and that NaN
    would then corrupt every subsequent iteration.
    """
    # Start from a float copy of the current centroids, so that empty
    # clusters fall back to their old position and the input is not mutated.
    updated = np.array(centroids, dtype=float)
    for k in range(centroids.shape[0]):
        members = points[closest == k]
        if len(members) > 0:
            updated[k] = members.mean(axis=0)
    return updated

# One update step: assign the points to the centroids in c, then recompute
# the centroid positions from those assignments.
move_centroids(points, closest_centroid(points, c), c)

array([[-0.06971844,  0.66614582],
       [ 0.85831935, -0.18948577],
       [-0.82669027,  0.23483006]])

# Left panel: the data with a random draw of three initial centroids (red).
plt.subplot(121)
plt.scatter(points[:, 0], points[:, 1])
centroids = initialize_centroids(points, 3)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)

# Right panel: the same data after one assignment + update step, showing
# where the centroids have moved to.
plt.subplot(122)
plt.scatter(points[:, 0], points[:, 1])
closest = closest_centroid(points, centroids)
centroids = move_centroids(points, closest, centroids)
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)

<matplotlib.collections.PathCollection at 0x1f34d0f0>

from JSAnimation import IPython_display
from matplotlib import animation

# create a simple animation: each frame performs one k-means iteration (k=3)
fig = plt.figure()
ax = plt.axes(xlim=(-4, 4), ylim=(-4, 4))
centroids = initialize_centroids(points, 3)

def init():
    """Nothing is drawn before the first frame; return an empty artist list."""
    return []

def animate(i):
    """Run one k-means iteration and redraw the points and centroids.

    ``i`` is the frame number supplied by FuncAnimation; it is not used
    because the state is carried in the module-level ``centroids``.
    Returns the two scatter artists drawn for this frame.
    """
    global centroids
    closest = closest_centroid(points, centroids)
    centroids = move_centroids(points, closest, centroids)
    ax.cla()
    # cla() also discards the axis limits chosen when the axes were created,
    # so restore them to keep the view fixed from frame to frame.
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    point_artist = ax.scatter(points[:, 0], points[:, 1], c=closest)
    centroid_artist = ax.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)
    return point_artist, centroid_artist

# The whole axes is cleared and redrawn on every frame, so blitting brings no
# benefit; moreover blit=True requires init/animate to return artists and
# fails when they return None.
animation.FuncAnimation(fig, animate, init_func=init,
                        frames=10, interval=200, blit=False)

# Same animation with k=7 centroids and more frames: each frame performs one
# k-means iteration.
fig = plt.figure()
ax = plt.axes(xlim=(-4, 4), ylim=(-4, 4))
centroids = initialize_centroids(points, 7)

def init():
    """Nothing is drawn before the first frame; return an empty artist list."""
    return []

def animate(i):
    """Run one k-means iteration and redraw the points and centroids.

    ``i`` is the frame number supplied by FuncAnimation; it is not used
    because the state is carried in the module-level ``centroids``.
    Returns the two scatter artists drawn for this frame.
    """
    global centroids
    closest = closest_centroid(points, centroids)
    centroids = move_centroids(points, closest, centroids)
    ax.cla()
    # cla() also discards the axis limits chosen when the axes were created,
    # so restore them to keep the view fixed from frame to frame.
    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)
    point_artist = ax.scatter(points[:, 0], points[:, 1], c=closest)
    centroid_artist = ax.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)
    return point_artist, centroid_artist

# The whole axes is cleared and redrawn on every frame, so blitting brings no
# benefit; moreover blit=True requires init/animate to return artists and
# fails when they return None.
animation.FuncAnimation(fig, animate, init_func=init,
                        frames=30, interval=200, blit=False)

Implementing the k-means algorithm with numpy

Comments