The Kaggle What's Cooking challenge
In this blog post, we'll have a look at the Kaggle What's Cooking data challenge.
This competition is all about predicting which cuisine a recipe belongs to, given its list of ingredients. For example, assume you have a recipe that reads:
plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil
Can you guess from which cuisine this is? As this recipe comes from the training set of this challenge, I can tell you that the expected answer is Southern US.
Without further ado, let's dive in. First, let us have a look at the training data.
Exploring the training data
We'll use pandas to go through the data. First, let's read the JSON file containing the recipes and the cuisines:
import pandas as pd
df_train = pd.read_json('train.json')
Let's look at the head of the data:
df_train.head()
| | cuisine | id | ingredients |
|---|---|---|---|
| 0 | greek | 10259 | [romaine lettuce, black olives, grape tomatoes... |
| 1 | southern_us | 25693 | [plain flour, ground pepper, salt, tomatoes, g... |
| 2 | filipino | 20130 | [eggs, pepper, salt, mayonaise, cooking oil, g... |
| 3 | indian | 22213 | [water, vegetable oil, wheat, salt] |
| 4 | indian | 13162 | [black pepper, shallots, cornflour, cayenne pe... |
We can see the structure of the data: a cuisine, an id for the recipe, and the list of ingredients.
As a first step, let's look at the cuisines in the dataset: how many are there, and how many recipes does each have?
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
df_train['cuisine'].value_counts().plot(kind='bar')
As can be seen in this figure, there are a lot of Italian, Mexican and Southern US recipes, and noticeably fewer of the others.
To get a little insight into the data itself, we can look at a couple of recipes. In particular, we can count the most frequent ingredients for each cuisine. To do that, we can use Python's Counter objects (found in the collections module of the standard library).
from collections import Counter
counters = {}
for cuisine in df_train['cuisine'].unique():
    counters[cuisine] = Counter()
    indices = (df_train['cuisine'] == cuisine)
    for ingredients in df_train[indices]['ingredients']:
        counters[cuisine].update(ingredients)
Let's look at a result:
counters['italian'].most_common(10)
[('salt', 3454), ('olive oil', 3111), ('garlic cloves', 1619), ('grated parmesan cheese', 1580), ('garlic', 1471), ('ground black pepper', 1444), ('extra-virgin olive oil', 1362), ('onions', 1240), ('water', 1052), ('butter', 1030)]
We can easily convert the top 10 ingredients for each cuisine to a separate dataframe for nicer viewing:
top10 = pd.DataFrame([[items[0] for items in counters[cuisine].most_common(10)] for cuisine in counters],
                     index=[cuisine for cuisine in counters],
                     columns=['top{}'.format(i) for i in range(1, 11)])
top10
| | top1 | top2 | top3 | top4 | top5 | top6 | top7 | top8 | top9 | top10 |
|---|---|---|---|---|---|---|---|---|---|---|
| jamaican | salt | onions | water | garlic | ground allspice | pepper | scallions | dried thyme | black pepper | garlic cloves |
| moroccan | salt | olive oil | ground cumin | onions | ground cinnamon | garlic cloves | water | ground ginger | carrots | paprika |
| irish | salt | all-purpose flour | butter | onions | potatoes | sugar | baking soda | baking powder | milk | carrots |
| russian | salt | sugar | onions | all-purpose flour | sour cream | eggs | water | butter | unsalted butter | large eggs |
| greek | salt | olive oil | dried oregano | garlic cloves | feta cheese crumbles | extra-virgin olive oil | fresh lemon juice | ground black pepper | garlic | pepper |
| french | salt | sugar | all-purpose flour | unsalted butter | olive oil | butter | water | large eggs | garlic cloves | ground black pepper |
| italian | salt | olive oil | garlic cloves | grated parmesan cheese | garlic | ground black pepper | extra-virgin olive oil | onions | water | butter |
| korean | soy sauce | sesame oil | garlic | green onions | sugar | salt | water | sesame seeds | onions | scallions |
| thai | fish sauce | garlic | salt | coconut milk | vegetable oil | soy sauce | sugar | water | garlic cloves | fresh lime juice |
| vietnamese | fish sauce | sugar | salt | garlic | water | carrots | soy sauce | shallots | garlic cloves | vegetable oil |
| southern_us | salt | butter | all-purpose flour | sugar | large eggs | baking powder | water | unsalted butter | milk | buttermilk |
| cajun_creole | salt | onions | garlic | green bell pepper | butter | olive oil | cayenne pepper | cajun seasoning | all-purpose flour | water |
| indian | salt | onions | garam masala | water | ground turmeric | garlic | cumin seed | ground cumin | vegetable oil | oil |
| british | salt | all-purpose flour | butter | milk | eggs | unsalted butter | sugar | onions | baking powder | large eggs |
| mexican | salt | onions | ground cumin | garlic | olive oil | chili powder | jalapeno chilies | sour cream | avocado | corn tortillas |
| brazilian | salt | onions | olive oil | lime | water | garlic cloves | garlic | cachaca | sugar | tomatoes |
| spanish | salt | olive oil | garlic cloves | extra-virgin olive oil | onions | water | tomatoes | ground black pepper | red bell pepper | pepper |
| filipino | salt | garlic | water | onions | soy sauce | pepper | oil | sugar | carrots | ground black pepper |
| chinese | soy sauce | sesame oil | salt | corn starch | sugar | garlic | water | green onions | vegetable oil | scallions |
| japanese | soy sauce | salt | mirin | sugar | water | sake | rice vinegar | vegetable oil | scallions | ginger |
An even better presentation would use images instead of words. We can do this by exporting the previous table to HTML and replacing the ingredient names with HTML image tags for the selected ingredients. This is done using regular expression matching, with each image base64-encoded directly into the source (thanks).
import re
import base64

def repl(m):
    ingredient = m.groups()[0]
    image_path = 'img/' + ingredient + '.png'
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return '<td><img width=100 src="data:image/png;base64,{}"></td>'.format(encoded_string.decode('utf-8'))

table_with_images = re.sub("<td>([ \-\w]+)</td>", repl, top10.to_html())
We can easily display this HTML output in our notebook:
from IPython.display import HTML
HTML(table_with_images)
(Rendered table: the same top-10 ingredients per cuisine as above, one row per cuisine, with each ingredient displayed as an image instead of its name.)
This visualization allows us to determine a couple of things. For instance, we can see that the top-1 ingredient for each cuisine is a salty one, which already lets us group the cuisines (a quick sketch of this grouping follows the list):
- salt is the standard for most cuisines
- soy sauce is number one for chinese, japanese and korean cuisines
- fish sauce is number one for thai and vietnamese cuisines
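Since the counters are still around, we can compute this grouping directly. Here's a minimal sketch (the groups mapping is just an illustration helper, not something we'll reuse):
from collections import defaultdict

groups = defaultdict(list)
for cuisine, counter in counters.items():
    # most_common(1) returns a list with a single (ingredient, count) pair
    groups[counter.most_common(1)[0][0]].append(cuisine)
print(dict(groups))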
Another thing that is easily seen from this table is that many ingredients appear under more than one name (a quick check follows the list):
- garlic cloves, garlic
- olive oil, extra-virgin olive oil
- ...
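We can verify this programmatically by looking for top-10 ingredient names that contain another top-10 name. This is only a heuristic sketch, since substring containment doesn't always mean the same ingredient, but it surfaces the obvious pairs:
names = set(top10.values.ravel())
for short_name in sorted(names):
    for long_name in sorted(names):
        # flags pairs like 'garlic' / 'garlic cloves' or 'olive oil' / 'extra-virgin olive oil'
        if short_name != long_name and short_name in long_name:
            print('{!r} appears within {!r}'.format(short_name, long_name))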
Judging from this table, it would be interesting to see which of the top-10 ingredients are highly specific to a certain cuisine. A way to do this is to simply count the number of times an ingredient appears in a given cuisine and divide by that cuisine's total number of recipes.
To do this, we first create a new column in our dataframe by concatenating each recipe's ingredients into a single string:
df_train['all_ingredients'] = df_train['ingredients'].map(";".join)
df_train.head()
| | cuisine | id | ingredients | all_ingredients |
|---|---|---|---|---|
| 0 | greek | 10259 | [romaine lettuce, black olives, grape tomatoes... | romaine lettuce;black olives;grape tomatoes;ga... |
| 1 | southern_us | 25693 | [plain flour, ground pepper, salt, tomatoes, g... | plain flour;ground pepper;salt;tomatoes;ground... |
| 2 | filipino | 20130 | [eggs, pepper, salt, mayonaise, cooking oil, g... | eggs;pepper;salt;mayonaise;cooking oil;green c... |
| 3 | indian | 22213 | [water, vegetable oil, wheat, salt] | water;vegetable oil;wheat;salt |
| 4 | indian | 13162 | [black pepper, shallots, cornflour, cayenne pe... | black pepper;shallots;cornflour;cayenne pepper... |
We can now take advantage of the powerful string processing functions of pandas to check for the presence of an ingredient in a recipe:
df_train['all_ingredients'].str.contains('garlic cloves')
0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12        True
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...
39744     True
39745    False
39746    False
39747    False
39748    False
39749    False
39750    False
39751    False
39752    False
39753     True
39754     True
39755    False
39756    False
39757     True
39758    False
39759    False
39760    False
39761     True
39762    False
39763    False
39764    False
39765    False
39766    False
39767    False
39768    False
39769    False
39770    False
39771    False
39772    False
39773    False
Name: all_ingredients, dtype: bool
This can be used to group our recipes by the presence of that ingredient:
indices = df_train['all_ingredients'].str.contains('garlic cloves')
df_train[indices]['cuisine'].value_counts().plot(kind='bar',
                                                 title='garlic cloves as found per cuisine')
However, we have to keep in mind that there are a lot of Italian recipes in our dataset to begin with, so it's appropriate to normalize each count by the total number of recipes in that cuisine before presenting the result:
relative_freq = (df_train[indices]['cuisine'].value_counts() / df_train['cuisine'].value_counts())
relative_freq.sort_values(inplace=True)
relative_freq.plot(kind='bar')
This way of looking at the data lets us see which cuisines use garlic cloves a lot in the recipes found in the database. As expected, Mediterranean and Asian cuisines are at the top, and British at the bottom.
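If we'd rather read off the extremes programmatically than from the bar plot, a tiny sketch on the freshly sorted relative_freq does it (it's sorted in ascending order above):
print(relative_freq.head(3))  # cuisines using garlic cloves the least
print(relative_freq.tail(3))  # cuisines using garlic cloves the most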
We can do this sort of plot for all top 10 ingredients. First let's determine the unique ingredients:
import numpy as np
unique = np.unique(top10.values.ravel())
unique
array(['all-purpose flour', 'avocado', 'baking powder', 'baking soda', 'black pepper', 'butter', 'buttermilk', 'cachaca', 'cajun seasoning', 'carrots', 'cayenne pepper', 'chili powder', 'coconut milk', 'corn starch', 'corn tortillas', 'cumin seed', 'dried oregano', 'dried thyme', 'eggs', 'extra-virgin olive oil', 'feta cheese crumbles', 'fish sauce', 'fresh lemon juice', 'fresh lime juice', 'garam masala', 'garlic', 'garlic cloves', 'ginger', 'grated parmesan cheese', 'green bell pepper', 'green onions', 'ground allspice', 'ground black pepper', 'ground cinnamon', 'ground cumin', 'ground ginger', 'ground turmeric', 'jalapeno chilies', 'large eggs', 'lime', 'milk', 'mirin', 'oil', 'olive oil', 'onions', 'paprika', 'pepper', 'potatoes', 'red bell pepper', 'rice vinegar', 'sake', 'salt', 'scallions', 'sesame oil', 'sesame seeds', 'shallots', 'sour cream', 'soy sauce', 'sugar', 'tomatoes', 'unsalted butter', 'vegetable oil', 'water'], dtype=object)
It turns out we can fit these in an 8 by 8 subplot grid (there are 63 unique ingredients, so one panel stays empty):
fig, axes = plt.subplots(8, 8, figsize=(20, 20))
for ingredient, ax_index in zip(unique, range(64)):
    indices = df_train['all_ingredients'].str.contains(ingredient)
    relative_freq = (df_train[indices]['cuisine'].value_counts() / df_train['cuisine'].value_counts())
    relative_freq.plot(kind='bar', ax=axes.ravel()[ax_index], fontsize=7, title=ingredient)
The previous diagram, even if it's a bit hard to read, allows us to spot ingredients that have a high degree of uniqueness. Among them, I'd list:
- soy sauce (Asian cuisines)
- sake (Japanese)
- sesame oil (Asian cuisines)
- feta cheese crumbles (Greek)
- garam masala (Indian)
- ground ginger (Moroccan)
- avocado (Mexican)
Others are quite common:
- salt
- oil
- pepper
- sugar
This nicely concludes our data exploration. At the same time, it allows us to form a little intuition about how we could categorize a recipe's cuisine based on the ingredients:
- are there highly specific ingredients in the recipe that clearly point it to a given cuisine?
In the next section, we will train a logistic regression classifier on the data we have analyzed so far and look at the results.
Training a logistic regression classifier
We will use scikit-learn to perform our classification. First, we need to encode our features as a matrix that the machine learning algorithms in scikit-learn can use. This is done using a count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
We can conveniently let the count vectorizer determine the features and build the matrix, whose entries count how often each word appears in a recipe (effectively 1s and 0s here), in a single step as follows:
cv = CountVectorizer()
X = cv.fit_transform(df_train['all_ingredients'].values)
We can check the shape of that matrix:
X.shape
(39774, 3010)
We see that the vectorizer has retained 3010 features and processed the roughly 40 000 recipes in the training dataset. Note that the default tokenizer splits text on word boundaries, so these features are individual words rather than whole ingredient names. We can easily inspect them using the vectorizer's vocabulary_ attribute (which is a dictionary):
print(list(cv.vocabulary_.keys())[:100])
['lipton', 'hand', 'branzino', 'mayonnaise', 'tip', 'gouda', 'tamarind', 'meat', 'lea', 'saltine', 'style', 'ajinomoto', 'greek', 'chuck', 'ducklings', 'korean', 'tokyo', 'tender', 'shoots', 'prik', 'bitters', 'believ', 'satsuma', 'cardamon', 'pilaf', 'roll', 'crusts', 'brisée', 'penn', 'tamale', 'boneless', 'skate', 'picante', 'raw', 'swerve', 'sponge', 'filling', 'choi', 'sharp', 'cranberry', 'salt', 'manicotti', 'atar', 'quorn', 'ale', 'pound', 'nectar', 'iron', 'licorice', 'daiya', 'mince', 'chiffonade', 'japanese', 'yuzu', 'fleshed', 'picholine', 'ragu', 'buds', 'loosely', 'peppermint', 'cashews', 'yaki', 'stir', 'nam', 'blackberries', 'heirloom', 'terrine', 'crumbles', 'bonnet', 'clam', 'katsuo', 'cauliflowerets', 'crust', 'callaloo', 'doughs', 'wraps', 'pimentos', 'shaved', 'oregano', 'tomatillo', 'lump', 'cups', 'marin', 'olives', 'nectarines', 'greekstyl', 'bottom', 'fresca', 'cardoons', 'dijonnaise', 'delicata', 'lite', 'accompaniment', 'sprouts', 'flora', 'sansho', 'groundnut', 'seitan', 'tapioca', 'short']
Each feature gets assigned a column number, and the corresponding matrix entry is 1 or 0 depending on whether the word is present in the recipe or not.
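To make this concrete, here is a small sketch that recovers which vocabulary words are marked in the row of X for the first recipe (the inverted vocabulary is our own helper, not a scikit-learn feature):
inverse_vocabulary = {column: word for word, column in cv.vocabulary_.items()}
first_row = X[0].toarray().ravel()  # X is a sparse matrix, so densify a single row
print([inverse_vocabulary[i] for i in first_row.nonzero()[0]])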
Now that we have our feature matrix, we still need to encode the labels that represent the cuisine of each recipe. This is done with a label encoder:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(df_train.cuisine)
The variable y is now a vector with a number instead of a string for each cuisine:
y[:100]
array([ 6, 16, 4, 7, 7, 10, 17, 9, 13, 9, 9, 3, 9, 13, 9, 7, 1, 9, 18, 19, 18, 13, 16, 3, 9, 3, 2, 9, 3, 13, 9, 2, 13, 18, 9, 2, 9, 4, 16, 16, 9, 0, 13, 7, 13, 3, 5, 16, 16, 16, 11, 16, 9, 16, 9, 10, 11, 7, 9, 8, 18, 18, 7, 10, 9, 18, 12, 5, 5, 16, 17, 7, 14, 9, 9, 14, 14, 19, 11, 13, 2, 16, 5, 7, 7, 9, 9, 7, 12, 17, 9, 16, 16, 6, 13, 13, 16, 7, 9, 9])
We can check the result by inspecting the encoder's classes:
enc.classes_
array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek', 'indian', 'irish', 'italian', 'jamaican', 'japanese', 'korean', 'mexican', 'moroccan', 'russian', 'southern_us', 'spanish', 'thai', 'vietnamese'], dtype=object)
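As a quick sanity check, we can map the first few encoded labels back to cuisine names; they should match the head of the dataframe we printed at the start:
print(enc.inverse_transform(y[:5]))
# expected: ['greek' 'southern_us' 'filipino' 'indian' 'indian']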
Let's now train a logistic regression on the dataset. We'll split the data so that we can also test our classifier on recipes it hasn't seen before:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Now, let's train a logistic regression:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
We can evaluate our classifier on our test set:
logistic.score(X_test, y_test)
0.77737272155876802
It turns out it performs quite nicely, with an accuracy of about 78%.
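A single train/test split can be somewhat noisy. If we wanted a more robust estimate, a sketch using k-fold cross-validation (imported from the same old-style sklearn.cross_validation module as train_test_split above) would look like this; I'd expect scores in the same ballpark, but that's an assumption to verify:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(LogisticRegression(), X, y, cv=5)
print(scores.mean(), scores.std())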
However, this doesn't tell the whole story about what's happening. Let's inspect the classification results using a confusion matrix.
Inspecting the classification results using a confusion matrix
A confusion matrix allows us to see the mistakes the classifier makes. Since we normalize each row by the number of recipes that truly belong to that cuisine (see the code below), it should be read row by row: each row shows how the recipes of one true cuisine were distributed over the predicted cuisines, and the color of each square indicates the relative frequency of that prediction.
from sklearn.metrics import confusion_matrix
plt.figure(figsize=(10, 10))
cm = confusion_matrix(y_test, logistic.predict(X_test))
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
plt.imshow(cm_normalized, interpolation='nearest')
plt.title("confusion matrix")
plt.colorbar(shrink=0.3)
# confusion_matrix orders its rows and columns by encoded label, i.e. alphabetically,
# so the tick labels must follow that same order
cuisines = enc.classes_
tick_marks = np.arange(len(cuisines))
plt.xticks(tick_marks, cuisines, rotation=90)
plt.yticks(tick_marks, cuisines)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Here, we see that some cuisines are really well predicted (Mexican, Indian, Italian) while some suffer from confusion (Russian and British recipes are often predicted as other cuisines, same with Irish).
Another way to look at the results is the classification report from scikit-learn:
from sklearn.metrics import classification_report
y_pred = logistic.predict(X_test)
print(classification_report(y_test, y_pred, target_names=cuisines))
              precision    recall  f1-score   support

   brazilian       0.78      0.47      0.58        96
     british       0.58      0.41      0.48       165
cajun_creole       0.80      0.66      0.73       289
     chinese       0.78      0.85      0.82       542
    filipino       0.70      0.54      0.61       140
      french       0.62      0.63      0.63       555
       greek       0.78      0.70      0.74       228
      indian       0.85      0.90      0.87       596
       irish       0.69      0.47      0.56       123
     italian       0.80      0.89      0.84      1587
    jamaican       0.84      0.73      0.78        97
    japanese       0.81      0.72      0.76       289
      korean       0.81      0.76      0.78       191
     mexican       0.89      0.91      0.90      1262
    moroccan       0.80      0.72      0.76       157
     russian       0.57      0.40      0.47       100
 southern_us       0.69      0.81      0.74       860
     spanish       0.67      0.50      0.57       195
        thai       0.74      0.72      0.73       321
  vietnamese       0.69      0.48      0.57       162

 avg / total       0.77      0.78      0.77      7955
This lets us see the different classification metrics (precision, recall, F1 score) for every cuisine in a single place.
From the previous analyses, we can come up with a number of ideas for improving our classification results: merging duplicate ingredient names, for instance, or reweighting ingredients by how informative they are. A sketch of the latter follows.
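As an illustration of the reweighting idea, here's a minimal sketch that swaps the count vectorizer for a tf-idf vectorizer, which down-weights ubiquitous ingredients such as salt; whether this actually improves the score is something we'd have to measure:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df_train['all_ingredients'].values)
# same split and model as before, only the features change
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf, y, test_size=0.2)
print(LogisticRegression().fit(X_train2, y_train2).score(X_test2, y_test2))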
Conclusions
In this post, we've gone through the different stages of a machine learning project: we first explored in depth the data that came with the challenge, then went on to train a model, whose results we tried to analyze. It's not obvious from these results which single change would most improve our classification, but they give us quite a lot of information to work with.