import pandas as pd

# Load the recipe data set from train.json in the working directory.
df = pd.read_json('train.json')

# Preview the first rows to check what was loaded.
df.head()

from collections import Counter

def count_ingredients(df):
    """Count how often each ingredient occurs across the recipes in ``df``.

    Args:
        df: Object whose ``'ingredients'`` entry iterates over recipes, each
            recipe being a list (or other sequence) of hashable ingredient
            names -- for example a DataFrame with an ``ingredients`` column.

    Returns:
        A ``collections.Counter`` mapping each ingredient to its number of
        occurrences. An ingredient listed twice in one recipe counts twice;
        an input without recipes yields an empty counter.
    """
    c = Counter()
    for recipe in df['ingredients']:
        # Counter.update consumes a whole iterable, so pass the recipe
        # directly instead of wrapping every ingredient in its own list.
        c.update(recipe)

    return c

# Sanity check: count ingredients on a one-row slice holding only the first recipe.
count_ingredients(df.iloc[0:1])

Counter({'black olives': 1,
         'feta cheese crumbles': 1,
         'garbanzo beans': 1,
         'garlic': 1,
         'grape tomatoes': 1,
         'pepper': 1,
         'purple onion': 1,
         'romaine lettuce': 1,
         'seasoning': 1})

# Raw ingredient list of the first recipe, to compare against the counts for that same row.
df.iloc[0]['ingredients']

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles']

# Boolean mask selecting the recipes labelled as Greek.
is_greek = df['cuisine'].eq('greek')
greek_counts = count_ingredients(df.loc[is_greek])

# Peek at ten (ingredient, count) pairs in the counter's own iteration order.
list(greek_counts.items())[:10]

[('chopped green bell pepper', 8),
 ('fresh bay leaves', 2),
 ('canned low sodium chicken broth', 5),
 ('cucumber', 187),
 ('thin pizza crust', 2),
 ('cooking spray', 67),
 ('top sirloin steak', 1),
 ('bone in skin on chicken thigh', 1),
 ('granulated garlic', 3),
 ('chopped parsley', 18)]

# The ten most frequent ingredients in Greek recipes, highest count first.
greek_counts.most_common(10)

[('salt', 572),
 ('olive oil', 504),
 ('dried oregano', 267),
 ('garlic cloves', 254),
 ('feta cheese crumbles', 252),
 ('extra-virgin olive oil', 229),
 ('fresh lemon juice', 222),
 ('ground black pepper', 221),
 ('garlic', 216),
 ('pepper', 203)]

# Ingredient counts over every recipe that is NOT Greek (the inverted mask).
non_greek_counts = count_ingredients(df[~is_greek])

from bokeh.plotting import output_notebook, figure, show
from bokeh.models import HoverTool, BoxSelectTool

# Direct Bokeh output to the notebook; must run before any show() call below.
output_notebook()

# Tools attached to the test figure: box selection plus hover.
TOOLS = [BoxSelectTool(), HoverTool()]

# Throwaway figure used only to try out a scatter plot with the tools above.
p = figure(plot_width=600, plot_height=400, title='A test scatter plot with hover labels', tools=TOOLS)

# Five arbitrary points; the values carry no meaning.
p.circle([1, 2, 3, 4, 5], [2, 5, 8, 2, 7], size=10)

show(p)

from bokeh.plotting import ColumnDataSource
from bokeh.models import BoxZoomTool, ResetTool

def qqplot(x_list, y_list, specific_cuisine_name):
    """Show a rank-versus-rank ("QQ") scatter plot of two sorted item lists.

    Every item of ``y_list`` becomes one point. Positions are rescaled so the
    first item of a list maps to 1.0 and later items approach 0. An item of
    ``y_list`` that does not occur in ``x_list`` is placed at x = 0.

    Args:
        x_list: Sorted list of hashable items providing the abscissa ranks.
        y_list: Sorted list of hashable items providing the ordinate ranks
            and the hover labels.
        specific_cuisine_name: Name inserted into the plot title and the
            y-axis label.

    Returns:
        None. The figure is displayed through Bokeh's ``show``.
    """
    # Record the first position of every item once, instead of running a
    # linear ``x_list.index`` search for each item of y_list (quadratic).
    x_rank = {}
    for x_index, item in enumerate(x_list):
        x_rank.setdefault(item, x_index)

    n_x = len(x_list)
    n_y = len(y_list)
    # With an empty x_list every item is "missing"; dividing by 1 keeps those
    # points at x = 0 rather than raising ZeroDivisionError.
    x_scale = n_x or 1

    labels = list(y_list)
    x_coords = [(n_x - x_rank.get(item, n_x)) / x_scale for item in labels]
    # range(n_y) is empty when y_list is empty, so n_y is never a zero divisor.
    y_coords = [(n_y - y_index) / n_y for y_index in range(n_y)]

    source = ColumnDataSource(
        data=dict(
            x=x_coords,
            y=y_coords,
            desc=labels,
        )
    )

    # Custom HTML tooltip: item name and point index, then cursor location.
    hover = HoverTool(
        tooltips="""
        <div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">@desc</span>
                <span style="font-size: 12px; color: #966;">[$index]</span>
            </div>
            <div>
                <span style="font-size: 12px;">Location</span>
                <span style="font-size: 10px; color: #696;">($x, $y)</span>
            </div>
        </div>
        """
    )

    TOOLS = [BoxZoomTool(), ResetTool(), hover]

    p = figure(plot_width=600, plot_height=400, title='QQ plot for {} cuisine'.format(specific_cuisine_name), tools=TOOLS)

    p.circle('x', 'y', size=5, source=source)
    # Diagonal reference line: points on it have the same relative rank in both lists.
    p.line([0, 1], [0, 1], line_width=2)
    p.xaxis.axis_label = "rank in all other cuisines"
    p.yaxis.axis_label = "rank in {} cuisine".format(specific_cuisine_name)
    show(p)

# Keep only the ingredient names, ordered from most to least frequent.
sorted_greek_ingredients = [name for name, _count in greek_counts.most_common()]

sorted_non_greek_ingredients = [name for name, _count in non_greek_counts.most_common()]

# Abscissa: rank among all other cuisines; ordinate: rank within Greek recipes.
qqplot(sorted_non_greek_ingredients, sorted_greek_ingredients, 'greek')

# One-vs-rest comparison: draw a QQ plot for every cuisine label in the data.
for cuisine in df['cuisine'].unique():
    is_cuisine = df['cuisine'] == cuisine
    # Count the selected cuisine and everything else separately.
    cuisine_counts = count_ingredients(df.loc[is_cuisine])
    other_counts = count_ingredients(df.loc[~is_cuisine])
    # Reduce each counter to ingredient names, most frequent first.
    sorted_cuisine = [name for name, _count in cuisine_counts.most_common()]
    sorted_other = [name for name, _count in other_counts.most_common()]
    qqplot(sorted_other, sorted_cuisine, cuisine)

	cuisine	id	ingredients
0	greek	10259	[romaine lettuce, black olives, grape tomatoes...
1	southern_us	25693	[plain flour, ground pepper, salt, tomatoes, g...
2	filipino	20130	[eggs, pepper, salt, mayonaise, cooking oil, g...
3	indian	22213	[water, vegetable oil, wheat, salt]
4	indian	13162	[black pepper, shallots, cornflour, cayenne pe...

Visualizing the Kaggle What's Cooking recipes using Bokeh and QQ plots

Loading the data and counting it

Visualization using Bokeh

Visualization of the whole dataset

Comments