import pandas as pd

# Load the kanji table: tab-separated with no header row, so the columns are
# addressed by integer position.
df = pd.read_csv("kanji_list.csv", sep='\t', header=None)

# Notebook preview of the first ten rows.
df.head(10)

# Column 1 holds the character itself. Rows where it is missing are dropped
# (in the preview, every other row carries only a romanised reading).
kanji = df[1].dropna().values

type(kanji)

numpy.ndarray

kanji[:10]

array(['亜', '哀', '挨', '愛', '曖', '悪', '握', '圧', '扱', '宛'], dtype=object)

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

%matplotlib inline

# Font properties for the preview glyph: the ipam.ttc font file at size 50.
prop = fm.FontProperties(fname='ipam.ttc', size=50)

# Draw the first character centred on a 1x1 figure as a quick visual check.
plt.figure(figsize=(1, 1))
plt.text(0, 0, kanji[0], ha='center', va='center', fontproperties=prop)
# Symmetric limits keep the text anchor (0, 0) in the middle of the axes.
plt.xlim(-0.1, 0.1)
plt.ylim(-0.1, 0.1)

(-0.1, 0.1)

def rasterize_kanji(kanji, save_to, font_path='ipam.ttc', font_size=70):
    """Render one character to an image file.

    The first character of ``kanji`` is drawn centred on a 1x1 figure with
    the axes hidden, and the figure is written to ``save_to``.

    Parameters
    ----------
    kanji : str
        Text to draw; only ``kanji[0]`` is rendered.
    save_to : str
        Destination path handed to ``plt.savefig``.
    font_path : str, optional
        Font file used for the glyph (defaults to the original 'ipam.ttc').
    font_size : int, optional
        Font size passed to ``FontProperties`` (defaults to the original 70).
    """
    fig = plt.figure(figsize=(1, 1))
    try:
        prop = fm.FontProperties(fname=font_path, size=font_size)
        plt.text(0, 0, kanji[0], ha='center', va='center', fontproperties=prop)
        plt.xlim(-0.1, 0.1)
        plt.ylim(-0.1, 0.1)
        plt.axis("off")
        plt.savefig(save_to)
    finally:
        # Always release the figure: this function is called once per
        # character, and a failure (bad font path, missing output directory)
        # would otherwise leave an open figure behind on every call.
        plt.close(fig)

# Smoke-test the rasteriser on the first character before processing the full list.
rasterize_kanji(kanji[0], "1.png")

from IPython.display import Image
# Display the file that was just written, inline in the notebook.
Image(filename='1.png')

# Rasterise every character to img/NNNN.png, where NNNN is the character's
# zero-padded position in `kanji`.
import os

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("img", exist_ok=True)
for i, k in enumerate(kanji):
    rasterize_kanji(k, "img/{0:04}.png".format(i))

import numpy as np
import sklearn
import os
from scipy import ndimage

# Collect the rasterised glyphs. os.listdir returns entries in arbitrary
# order, so the names are sorted to make X reproducible between runs. The
# files written by the rasterising loop are named by zero-padded index, so
# (provided img/ holds only those files) row i of X is the image of kanji[i].
image_names = sorted(fname for fname in os.listdir('img/') if fname.endswith('.png'))

# One flattened greyscale image per row.
# NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and removed in
# 1.2; on a newer SciPy this call needs an equivalent greyscale image loader.
X = np.array([ndimage.imread(os.path.join('img/', fname), flatten=True).ravel() for fname in image_names])

# Sanity check: one row per image, one column per pixel.
X.shape

(2138, 5184)

# The 5184 columns correspond to a 72 x 72 pixel image.
72*72

5184

# Reshape the first row back into a square image to confirm the round trip.
plt.imshow(X[0, :].reshape((72, 72)), cmap='gray')

<matplotlib.image.AxesImage at 0x10760ed68>

from sklearn.decomposition import PCA

# Fit a 100-component PCA to the flattened images.
pca = PCA(n_components=100)
pca.fit(X)
# Fraction of the total variance captured by each component.
pca_score = pca.explained_variance_ratio_
# Principal axes, one per row; each row has one value per pixel.
V = pca.components_

# Cumulative explained variance as more components are kept.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

# Variance fraction explained by all fitted components together.
np.cumsum(pca_score)[-1]

0.66575521

# The first principal axis, viewed as a 72 x 72 image.
plt.imshow(V[0, :].reshape((72, 72)), cmap='gray')
plt.colorbar()
plt.title('first principal component of kanji dataset');

from random import randint
def plot_random_kanji():
    """Fill a 10x10 subplot grid on the current figure with random dataset images.

    100 row indices of ``X`` are drawn with ``np.random.choice`` (sampling with
    replacement, so a character may appear more than once); each row is shown
    as a 72x72 greyscale image with the axes hidden.
    """
    picks = np.random.choice(np.arange(X.shape[0]), 100)
    for cell, row in enumerate(picks, start=1):
        plt.subplot(10, 10, cell)
        glyph = X[row, :].reshape((72, 72))
        plt.imshow(glyph, cmap='gray')
        plt.axis('off')
# Open a 10x10 figure so the 100 tiles have room, then draw the random sample.
plt.figure(figsize=(10, 10))
plot_random_kanji()

def plot_principal_components():
    """Tile the first 100 rows of ``V`` as images on the current figure.

    Each principal axis is reshaped to 72x72 and drawn in greyscale, without
    axes, in successive cells of a 10x10 subplot grid.
    """
    for cell in range(1, 101):
        plt.subplot(10, 10, cell)
        axis_image = V[cell - 1, :].reshape((72, 72))
        plt.imshow(axis_image, cmap='gray')
        plt.axis('off')

# Open a 10x10 figure and tile the principal axes on it.
plt.figure(figsize=(10, 10))
plot_principal_components()

def decompose_character(kanji):
    """Plot reconstructions of ``kanji`` from its 1, 10, 50 and 100 strongest components.

    The flattened image is projected onto every row of ``V``; components are
    ranked by the absolute value of their projection, and for each count the
    top-ranked components are summed (weighted by their projection) and drawn
    as a 72x72 greyscale image in a 2x2 subplot grid on the current figure.

    Parameters
    ----------
    kanji : numpy.ndarray
        Flattened 72x72 image (one row of ``X``).
    """
    # NOTE(review): the projection does not subtract pca.mean_ (and the
    # reconstruction does not add it back), so this is not the usual PCA
    # reconstruction -- confirm that is intended.
    weights = V.dot(kanji)
    # Strongest components first. mergesort is stable, so ties keep their
    # original component order.
    ranking = np.argsort(-np.abs(weights), kind='mergesort')

    for i, components in enumerate([1, 10, 50, 100]):
        # Slicing clamps to the number of fitted components, so this no longer
        # assumes the PCA was fitted with exactly 100 of them.
        keep = ranking[:components]
        # A fresh float result avoids in-place accumulation into the input's
        # dtype, which fails for integer images.
        approximation = weights[keep].dot(V[keep, :])

        plt.subplot(2, 2, i + 1)
        plt.imshow(approximation.reshape((72, 72)), cmap='gray')
        plt.axis('off')

# Show the progressive reconstruction of the first image in the dataset.
decompose_character(X[0, :])

import skimage

# Echo the scikit-image release this notebook was run with.
skimage.__version__

'0.11.3'

from skimage.filters import threshold_otsu

def decompose_character_threshold(kanji):
    """Plot binarised reconstructions of ``kanji`` next to the original image.

    The flattened image is projected onto every row of ``V`` and the
    components are ranked by absolute projection. Reconstructions from the
    1, 10, 25, 50 and 100 strongest components are each binarised with Otsu's
    threshold and drawn in cells 1-5 of a 2x3 subplot grid; cell 6 shows the
    unmodified input.

    Parameters
    ----------
    kanji : numpy.ndarray
        Flattened 72x72 image (one row of ``X``).
    """
    # NOTE(review): the projection does not subtract pca.mean_ -- confirm
    # that the uncentred reconstruction is intended.
    weights = V.dot(kanji)
    # Strongest components first; mergesort keeps ties in component order.
    ranking = np.argsort(-np.abs(weights), kind='mergesort')

    for i, components in enumerate([1, 10, 25, 50, 100]):
        # Slicing clamps to the number of fitted components instead of
        # assuming exactly 100 are available.
        keep = ranking[:components]
        approximation = weights[keep].dot(V[keep, :])
        thresh = threshold_otsu(approximation)
        binary = approximation > thresh
        plt.subplot(2, 3, i + 1)
        plt.imshow(binary.reshape((72, 72)), cmap='gray')
        plt.axis('off')

    # Last cell: the original character, for comparison.
    plt.subplot(2, 3, 6)
    plt.imshow(kanji.reshape((72, 72)), cmap='gray')
    plt.axis('off')

# Show the thresholded reconstructions of the first image in the dataset.
decompose_character_threshold(X[0, :])

from IPython.html.widgets import interact

# Interactive control over every valid row index of X; each change redraws
# the thresholded reconstructions for the selected character.
interact(lambda index: decompose_character_threshold(X[index, :]),
         index=(0, X.shape[0] - 1))

def approximate_reconstruction(kanji, n_components):
    """Draw a binarised reconstruction of ``kanji`` on the current axes.

    The flattened image is projected onto every row of ``V``; the
    ``n_components`` components with the largest absolute projection are
    summed (weighted by their projection), binarised with Otsu's threshold
    and shown as a 72x72 greyscale image with the axes hidden.

    Parameters
    ----------
    kanji : numpy.ndarray
        Flattened 72x72 image (one row of ``X``).
    n_components : int
        How many of the strongest components to keep. Values larger than the
        number of fitted components use all of them.
    """
    # NOTE(review): the projection does not subtract pca.mean_ -- confirm
    # that the uncentred reconstruction is intended.
    weights = V.dot(kanji)
    # Strongest components first; mergesort keeps ties in component order.
    ranking = np.argsort(-np.abs(weights), kind='mergesort')

    # max() keeps a negative count meaning "no components", as range() did;
    # slicing clamps an oversized count instead of raising IndexError.
    keep = ranking[:max(n_components, 0)]
    approximation = weights[keep].dot(V[keep, :])

    thresh = threshold_otsu(approximation)
    binary = approximation > thresh
    plt.imshow(binary.reshape((72, 72)), cmap='gray')
    plt.axis('off')

# Two 10x10 galleries of randomly chosen characters, reconstructed first from
# their 100 strongest components and then from only their 20 strongest.
for n_kept in (100, 20):
    plt.figure(figsize=(10, 10))
    for cell in range(1, 101):
        plt.subplot(10, 10, cell)
        pick = np.random.choice(np.arange(X.shape[0]))
        approximate_reconstruction(X[pick, :], n_kept)

# Per-component mean and standard deviation of the dataset's PCA scores.
# The projection is computed once and reused for both statistics.
scores = pca.transform(X)
means = scores.mean(axis=0)
stds = scores.std(axis=0)

# Start from a blank image with the same shape and dtype as a dataset row.
new_kanji = np.zeros_like(X[0, :])
# Mix in between 5 and 49 randomly chosen components (randint's upper bound is
# exclusive). Components are drawn independently, so one may be picked twice.
for i in range(np.random.randint(5, 50)):
    component = np.random.choice(np.arange(V.shape[0]))
    # Weight sampled from a normal distribution matching that component's
    # score statistics over the dataset.
    weight = np.random.normal(means[component], stds[component])
    new_kanji += weight * V[component, :]

# Show the raw (unthresholded) synthetic character.
plt.imshow(new_kanji.reshape((72, 72)), cmap='gray')
plt.axis('off')

(-0.5, 71.5, 71.5, -0.5)

# Binarise the synthetic character with Otsu's threshold.
thresh = threshold_otsu(new_kanji)
binary = new_kanji > thresh
plt.imshow(binary.reshape((72, 72)), cmap='gray')
plt.axis('off')

(-0.5, 71.5, 71.5, -0.5)

# Same, but Gaussian-blur the image (sigma 1.5) before thresholding.
new_kanji_smooth = ndimage.gaussian_filter(new_kanji.reshape((72, 72)), 1.5)
plt.imshow((new_kanji_smooth > threshold_otsu(new_kanji_smooth)).reshape((72, 72)), cmap='gray')
plt.axis('off')

(-0.5, 71.5, 71.5, -0.5)

def examine_smoothing(factor):
    """Show the module-level ``new_kanji`` blurred with sigma ``factor`` and binarised.

    The flat image is reshaped to 72x72, Gaussian-filtered, thresholded with
    Otsu's method and drawn in greyscale on the current axes with the axes
    hidden.
    """
    blurred = ndimage.gaussian_filter(new_kanji.reshape((72, 72)), factor)
    cutoff = threshold_otsu(blurred)
    # `blurred` is already 72x72, so the mask needs no further reshaping.
    mask = blurred > cutoff
    plt.imshow(mask, cmap='gray')
    plt.axis('off')

# Interactive control for the blur width: from 0.5 to 5.5 in steps of 0.1.
interact(examine_smoothing,
         factor=(0.5, 5.5, 0.1))

# Blur, apply a size-2 grey dilation, blur again, then binarise with Otsu's
# threshold and display the result.
new_kanji_smooth = ndimage.gaussian_filter(new_kanji.reshape((72, 72)), 1.8)
new_kanji_smooth = ndimage.grey_dilation(new_kanji_smooth, size=2)
new_kanji_smooth = ndimage.gaussian_filter(new_kanji_smooth, 1.8)
plt.imshow((new_kanji_smooth > threshold_otsu(new_kanji_smooth)).reshape((72, 72)), cmap='gray')
plt.axis('off')

(-0.5, 71.5, 71.5, -0.5)

def make_gallery(func):
    """Call ``func`` once in each cell of a 10x10 subplot grid on the current figure.

    ``func`` takes no arguments and is expected to draw on the current axes;
    its return value is ignored.
    """
    for cell in range(1, 101):
        plt.subplot(10, 10, cell)
        func()

def generate_new_kanji(factor):
    """Draw one randomly synthesised character on the current axes.

    A blank image is built up from randomly chosen rows of ``V``, each scaled
    by a weight sampled from a normal distribution with that component's
    ``means`` / ``stds`` entry. The result is Gaussian-blurred with sigma
    ``factor``, binarised with Otsu's threshold and shown in greyscale with
    the axes hidden.
    """
    canvas = np.zeros_like(X[0])
    component_ids = np.arange(V.shape[0])

    # randint's upper bound is exclusive: 5 to 49 components are mixed in, and
    # the same component may be drawn more than once.
    n_draws = np.random.randint(5, 50)
    for _ in range(n_draws):
        chosen = np.random.choice(component_ids)
        scale = np.random.normal(means[chosen], stds[chosen])
        canvas += scale * V[chosen]

    blurred = ndimage.gaussian_filter(canvas.reshape((72, 72)), factor)
    # `blurred` is already 72x72, so the mask needs no further reshaping.
    plt.imshow(blurred > threshold_otsu(blurred), cmap='gray')
    plt.axis('off')

# Gallery of 100 synthetic characters blurred with sigma 1.8.
plt.figure(figsize=(10, 10))
make_gallery(lambda : generate_new_kanji(1.8))

# Same gallery with a stronger blur (sigma 2.3).
plt.figure(figsize=(10, 10))
make_gallery(lambda : generate_new_kanji(2.3))

def generate_new_kanji_binary_op(factor):
    """Draw one randomly synthesised character, cleaned up with a grey erosion.

    A blank image is built up from randomly chosen rows of ``V``, each scaled
    by a weight sampled from a normal distribution with that component's
    ``means`` / ``stds`` entry. The result is Gaussian-blurred with sigma
    ``factor``, eroded with a size-2 grey erosion, blurred again with the same
    sigma, binarised with Otsu's threshold and shown in greyscale with the
    axes hidden.
    """
    canvas = np.zeros_like(X[0])
    component_ids = np.arange(V.shape[0])

    # randint's upper bound is exclusive: 5 to 49 components are mixed in, and
    # the same component may be drawn more than once.
    n_draws = np.random.randint(5, 50)
    for _ in range(n_draws):
        chosen = np.random.choice(component_ids)
        scale = np.random.normal(means[chosen], stds[chosen])
        canvas += scale * V[chosen]

    # blur -> erode -> blur
    stage = ndimage.gaussian_filter(canvas.reshape((72, 72)), factor)
    stage = ndimage.grey_erosion(stage, size=2)
    stage = ndimage.gaussian_filter(stage, factor)

    # `stage` is already 72x72, so the mask needs no further reshaping.
    plt.imshow(stage > threshold_otsu(stage), cmap='gray')
    plt.axis('off')

# Galleries of 100 eroded synthetic characters at three blur widths.
# Sigma 2.1:
plt.figure(figsize=(10, 10))
make_gallery(lambda : generate_new_kanji_binary_op(2.1))

# Sigma 2.8:
plt.figure(figsize=(10, 10))
make_gallery(lambda : generate_new_kanji_binary_op(2.8))

# Sigma 1.5:
plt.figure(figsize=(10, 10))
make_gallery(lambda : generate_new_kanji_binary_op(1.5))

	0	1	2	3	4	5	6	7	8
0	1	亜	亞	二	7	S	NaN	sub-	ア
1	a	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	2	哀	NaN	口	9	S	NaN	pathetic	アイ、あわ-れ、あわ-れむ
3	ai, awa-re, awa-remu	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	3	挨	NaN	手	10	S	2010	push open	アイ
5	ai	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
6	4	愛	NaN	心	13	4	NaN	love	アイ
7	ai	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
8	5	曖	NaN	日	17	S	2010	not clear	アイ
9	ai	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Exploring Japanese characters with principal component analysis

Making the images

Computing the principal component analysis on our images

Bonus: how to create new characters from the dataset

Conclusions

Comments