Paris Apartment Prices: Scrape and Plot!
Since last week, I've been looking for a new apartment in Paris. I also happen to be on sick leave from work this week. So let's use this as an excuse to get some data from the internet and make some nice-looking maps and plots that analyze the rental market in Paris! The outline of this notebook will be:
- gathering the data by scraping a well-known French website for apartment rentals
- first building a prototype
- then scraping the full query from the website
- analyzing the data by plotting it in several formats
- regression plots
- categorical plots
- geographical choropleths
Let's get started!
Getting the data: prototyping
We will get our data from the French website seloger.com. I'm not sure whether this is a feature or a bug, but the search pages ship with a block of JSON data, which is exactly what we need to draw some maps. So, to get the data, we download the search page using requests and then parse it with BeautifulSoup.
Let's see how this works with an example. I just made a query for all two-room apartments available in Paris.
url = 'http://www.seloger.com/list.htm?org=advanced_search&idtt=1&idtypebien=1&cp=75&tri=initial&nb_pieces=2&naturebien=1,2,4'
Let's download the webpage. Side note: we have to pretend we're a browser, not a robot, by using custom headers.
headers = {'User-Agent': '*',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'}
import requests
s = requests.Session()
s.headers.update(headers)
s.get('http://www.seloger.com/')
<Response [200]>
r = s.get(url)
r
<Response [200]>
And parse it:
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')
I can find a script tag which contains the raw JSON:
for script_item in soup.find_all('script'):
    if 'var ava_data' in script_item.text:
        raw_json = script_item.text.split('=')[1][:-25]
raw_json[:400]
' \r\n\r\n\r\n\r\n\r\n\r\n \r\n \r\n\r\n \r\n \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n{\r\n "search" : {\r\n "levier" : "Recherche depuis la liste",\r\n "nbresults" : "2\xa0158",\r\n "nbpage" : "1",\r\n "typedetransaction" : ["location"],\r\n "nbpieces" : ["2"],\r\n "typedebien" : ["Appartement"],\r\n "pays" : "F'
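The string slicing above (splitting on '=' and chopping off the last 25 characters) is brittle, since it depends on the exact layout of the script tag. A slightly more robust sketch, assuming the var ava_data assignment is terminated by a semicolon (an assumption that may break if the site's markup changes):
import re

def extract_ava_data(script_text):
    """Extract the JSON object assigned to `var ava_data` in a script tag.
    Assumes the assignment ends with a semicolon, which may not hold if
    the site's markup changes."""
    match = re.search(r'var ava_data\s*=\s*(\{.*?\})\s*;', script_text, re.DOTALL)
    return match.group(1) if match else None

# raw_json = extract_ava_data(script_item.text)
Either way, we end up with a string containing raw JSON.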
Which I can transform into a dictionary, since it's JSON:
import json
data = json.loads(raw_json)
And feed it into a dataframe:
import pandas as pd
data.keys()
dict_keys(['search', 'products'])
df = pd.DataFrame(data['products'])
df.head(10)
affichagetype | codeinsee | codepostal | cp | etage | idagence | idannonce | idtiers | idtypechauffage | idtypecommerce | ... | nb_pieces | position | prix | produitsvisibilite | si_balcon | si_sdEau | si_sdbain | surface | typedebien | typedetransaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [{'name': 'list', 'value': True}] | 750118 | 75018 | 75018 | 2 | 481 | 124015305 | 122982 | central radiateur | 0 | ... | 2 | 0 | 970 | AD:AC:AG:BB:BX:AW | 0 | 0 | 1 | 42 | Appartement | [location] |
1 | [{'name': 'list', 'value': True}] | 750120 | 75020 | 75020 | 1 | 109745 | 124073427 | 165465 | 0 | 0 | ... | 2 | 1 | 1015 | AD:AC:AG:BB:AW | 0 | 0 | 0 | 26 | Appartement | [location] |
2 | [{'name': 'list', 'value': True}] | 750119 | 75019 | 75019 | 2 | 36283 | 123618371 | 136353 | 0 | 0 | ... | 2 | 2 | 1222 | AD:AC:BB:BX:AW | 0 | 1 | 0 | 54,1 | Appartement | [location] |
3 | [{'name': 'list', 'value': True}] | 750116 | 75016 | 75016 | 7 | 1509 | 123616423 | 66095 | central | 0 | ... | 2 | 3 | 1100 | AD:AC:AH:BB:BX:AW | 0 | 1 | 0 | 40 | Appartement | [location] |
4 | [{'name': 'list', 'value': True}] | 750115 | 75015 | 75015 | 5 | 1097 | 123594453 | 136882 | gaz | 0 | ... | 2 | 4 | 1456 | AD:AC:AG:BB:BX:AW | 0 | 0 | 0 | 46,04 | Appartement | [location] |
5 | [{'name': 'list', 'value': True}] | 750115 | 75015 | 75015 | 5 | 50643 | 123580081 | 78276 | central | 0 | ... | 2 | 5 | 1495 | AD:AC:AG:BB:AW | 1 | 0 | 1 | 54,02 | Appartement | [location] |
6 | [{'name': 'list', 'value': True}] | 750118 | 75018 | 75018 | 4 | 109745 | 120725333 | 165465 | 0 | 0 | ... | 2 | 6 | 1420 | AD:AC:AG:BB:AW | 0 | 1 | 1 | 37,43 | Appartement | [location] |
7 | [{'name': 'list', 'value': True}] | 750107 | 75007 | 75007 | 1 | 60741 | 123841041 | 65362 | 0 | 0 | ... | 2 | 7 | 1500 | AD:AC:AG:BB:BX:AW | 0 | 0 | 0 | 44 | Appartement | [location] |
8 | [{'name': 'list', 'value': True}] | 750116 | 75116 | 75116 | 2 | 35635 | 123643103 | 26189 | individuel électrique | 0 | ... | 2 | 8 | 1600 | AD:AC:AG:BB:AW | 0 | 0 | 1 | 53 | Appartement | [location] |
9 | [{'name': 'list', 'value': True}] | 750116 | 75016 | 75016 | 3 | 43088 | 123737121 | 18264 | central | 0 | ... | 2 | 9 | 1500 | AD:AC:AH:AG:BB:AW | 1 | 0 | 1 | 58 | Appartement | [location] |
10 rows × 25 columns
There we go! We have some data. We can plot it after a little bit of formatting (prices and surfaces use commas as decimal separators):
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import numpy as np
formatted = df.dropna()
numeric_series = ['surface', 'prix']
for series in numeric_series:
    formatted[series] = pd.to_numeric(formatted[series].str.replace(',', '.'))
formatted.plot.scatter(x='surface', y='prix')
<matplotlib.axes._subplots.AxesSubplot at 0x10ec5c668>
Getting the data: for-looping
Of course, we're only scratching the surface here, since we only have twenty data points. Let's write a for loop to extract the information for all 2000+ available apartments.
We can find the URL of the next-page button with this code:
next_page = soup.find('div', class_='pagination-number').find('a').attrs['href']
next_page
'http://www.seloger.com/list.htm?org=advanced_search&idtt=1&idtypebien=1&cp=75&tri=initial&nb_pieces=2&naturebien=1,2,4&LISTING-LISTpg=2'
All it takes to generate the links to all the pages is to replace that 2 with another number and to know what the last one is.
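For example, a one-line sketch of that replacement (page 5 is just an arbitrary example):
# Swap the trailing page number of next_page to point at any page we like.
url_page_5 = next_page[:-1] + str(5)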
The number of apartments is easy to find:
appartment_number = soup.find('div', class_='pagination-title').find('span', class_='u-500').text.replace('\xa0', '')
appartment_number
'2158'
So the last page will be:
int(appartment_number)//20
107
Let's now do that loop. (Strictly speaking, 2158 listings at 20 per page make 108 pages, and range() excludes its upper bound, so the loop below misses the last couple of pages; that's close enough for our purposes.)
import time
import tqdm
appartment_data = []
for i in tqdm.tqdm_notebook(range(2, int(appartment_number)//20)):
    url = next_page[:-1] + str(i)
    r = s.get(url, headers=headers)
    time.sleep(np.random.uniform(low=10, high=25))
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        for script_item in soup.find_all('script'):
            if 'var ava_data' in script_item.text:
                raw_json = script_item.text.split('=')[1][:-25]
                data = json.loads(raw_json)['products']
                appartment_data.append(data)
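With the random delays, scraping a hundred pages takes roughly half an hour, so it's worth dumping the raw result to disk before going further. A minimal sketch (the filename is arbitrary):
# Save the raw scraped listings so the analysis below can be re-run
# without hitting the website again (the filename is just an example).
with open('appartment_data_raw.json', 'w') as f:
    json.dump(appartment_data, f)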
Now that we have all the data, we can create a big dataframe with it. We have to think of a couple of things here:
- get numerical data
- drop duplicates
- drop lines with NA values
So we're going to write a little function that does all of that.
def create_df(appartment_data):
    """Creates a nicely formatted dataframe from our raw data."""
    df = pd.concat([pd.DataFrame(item) for item in appartment_data])
    df = df.dropna()
    df = df.drop(['affichagetype', 'typedetransaction'], axis=1)
    df = df.drop_duplicates()
    df = df[['codeinsee', 'codepostal', 'etage', 'idagence', 'idannonce', 'idtiers', 'nb_photos',
             'position', 'prix', 'si_balcon', 'surface']]
    df = df.apply(lambda s: pd.to_numeric(s.str.replace(',', '.')))
    # filter out zero-surface apartments
    df = df[~(df.surface == 0)]
    return df
df = create_df(appartment_data)
df.shape
(1963, 11)
df.head()
codeinsee | codepostal | etage | idagence | idannonce | idtiers | nb_photos | position | prix | si_balcon | surface | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 750116 | 75016 | 4 | 120206 | 122998107 | 179279 | 4 | 0 | 1400 | 0 | 41.0 |
1 | 750116 | 75016 | 1 | 38937 | 122351153 | 26029 | 4 | 1 | 1520 | 1 | 53.0 |
2 | 750116 | 75016 | 0 | 146641 | 123517815 | 210640 | 20 | 2 | 1970 | 0 | 64.0 |
3 | 750116 | 75016 | 1 | 146641 | 122006055 | 210640 | 24 | 3 | 3000 | 0 | 85.0 |
4 | 750115 | 75015 | 0 | 146641 | 124083973 | 210640 | 1 | 4 | 1499 | 0 | 54.0 |
It's plotting time
Good, let's now do some exploratory plots. We'll import seaborn and start with a pairplot of all these variables.
import seaborn as sns
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1110e6e48>
This is quite interesting to look at. Let's see if we can uncover some relationships in the data. I can think of a few:
- neighbourhoods: some are better/worse, which should have an impact on prices
- size of the apartment: larger should be more expensive
- which floor the apartment is on: lower is better?
- balconies: apartments with balconies are more expensive?
Let's look at all these.
Neighbourhoods
df['surface_group'] = pd.cut(df.surface, range(0, 100, 20))
fig, ax = plt.subplots(figsize=(15, 6))
sns.stripplot(x="codeinsee", y="prix", hue='surface_group', data=df, jitter=True, ax=ax);
It seems that some neighbourhoods are indeed pricier than others, since same-size apartments cost more there. Let's plot the distributions to see this in more detail.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(x="codeinsee", y="prix", hue='surface_group', data=df, ax=ax);
We can also rank neighbourhoods within a single category of apartment sizes.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(x="codeinsee", y="prix", data=df[df.surface_group == df.surface_group.cat.categories[1]], ax=ax);
Here, it seems the 1st, 3rd, 4th and 8th arrondissements are the most coveted.
Let's plot this as a pivot table.
pivot_table = df.pivot_table(index='codeinsee', columns=['surface_group'], values='prix')
pivot_table
surface_group | (0, 20] | (20, 40] | (40, 60] | (60, 80] |
---|---|---|---|---|
codeinsee | ||||
750101 | NaN | 1550.812500 | 1829.450000 | 2390.000000 |
750102 | NaN | 1412.727273 | 1768.058824 | 2261.000000 |
750103 | NaN | 1287.800000 | 1720.636364 | 2056.200000 |
750104 | NaN | 1447.913043 | 1763.064516 | 2597.875000 |
750105 | NaN | 1265.758621 | 1629.700000 | 1999.500000 |
750106 | 867.0 | 1393.840000 | 1802.258621 | 2367.125000 |
750107 | 900.0 | 1378.093750 | 1744.038462 | 2487.733333 |
750108 | 741.0 | 1606.517241 | 2026.939024 | 2435.476190 |
750109 | NaN | 1134.444444 | 1631.058824 | 1825.600000 |
750110 | NaN | 1074.647059 | 1384.241379 | 1926.000000 |
750111 | 1200.0 | 1166.628571 | 1398.452381 | 1700.000000 |
750112 | NaN | 1027.370370 | 1351.175000 | 1585.363636 |
750113 | NaN | 1027.190476 | 1297.088235 | 1525.000000 |
750114 | NaN | 1075.744186 | 1344.830189 | 1740.000000 |
750115 | NaN | 1073.352941 | 1412.928177 | 1754.333333 |
750116 | NaN | 1338.977273 | 1599.133333 | 2083.391304 |
750117 | 613.0 | 1235.140000 | 1537.494949 | 1811.437500 |
750118 | 695.0 | 1034.493671 | 1297.775510 | 1776.200000 |
750119 | NaN | 988.904762 | 1148.090909 | 1169.000000 |
750120 | NaN | 1014.187500 | 1160.437500 | 1809.000000 |
sns.heatmap(pivot_table)
<matplotlib.axes._subplots.AxesSubplot at 0x11ab4ce48>
To draw a choropleth, we first build a small dataframe that maps each arrondissement (numbered 1 to 20, matching the GeoJSON used below) to the mean rent of one surface category; indexing the pivot table with 30 selects the column whose interval contains 30 m², i.e. (20, 40].
geo_df = pd.DataFrame({'id': [i for i in range(1, 21)],
                       'prix': pivot_table[30].values})
geo_df.head()
id | prix | |
---|---|---|
0 | 1 | 1550.812500 |
1 | 2 | 1412.727273 |
2 | 3 | 1287.800000 |
3 | 4 | 1447.913043 |
4 | 5 | 1265.758621 |
We can now draw the choropleth with folium, joining our arrondissement ids to the cartodb_id property of a GeoJSON file of Paris arrondissements:
import folium
m = folium.Map(location=[48.87, 2.35], zoom_start=12)
m.choropleth(
geo_data='https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/paris.geojson',
key_on='feature.properties.cartodb_id',
data=geo_df,
columns=['id', 'prix'],
fill_color='OrRd',
threshold_scale=[0, 900, 1100, 1300, 1500, 1700],
highlight=True)
m
Apartment sizes
Let's see if we can make out a trend between apartment surface and price.
sns.lmplot(x='surface', y='prix', data=df, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x11adbe550>
As expected, the trend goes up. What does this look like across neighbourhoods?
sns.lmplot(x='surface', y='prix', hue='codeinsee', data=df, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x11aec41d0>
This is a surprising plot: the regression coefficient clearly depends on the neighbourhood! Let's draw a choropleth map of the regression coefficient.
First, let's compute the regression lines by hand using scikit-learn.
from sklearn.linear_model import LinearRegression
coefs = {}
for code in df.codeinsee.unique():
    x = df[df.codeinsee == code]['surface']
    y = df[df.codeinsee == code]['prix']
    reg = LinearRegression()
    reg.fit(x.values[:, np.newaxis], y.values[:, np.newaxis])
    # map the INSEE code (e.g. 750118) to a zero-based arrondissement index
    coefs[int(str(code)[-2:]) - 1] = reg.coef_[0][0]
geo_df = pd.concat([geo_df['id'], geo_df['prix'], pd.Series(coefs, name='coef')], axis=1)
geo_df.head()
id | prix | coef | |
---|---|---|---|
0 | 1 | 1550.812500 | 32.384756 |
1 | 2 | 1412.727273 | 27.892517 |
2 | 3 | 1287.800000 | 28.215213 |
3 | 4 | 1447.913043 | 28.349866 |
4 | 5 | 1265.758621 | 28.509817 |
geo_df.describe()
id | prix | coef | |
---|---|---|---|
count | 20.00000 | 20.000000 | 20.000000 |
mean | 10.50000 | 1226.727184 | 26.145085 |
std | 5.91608 | 192.131252 | 11.392574 |
min | 1.00000 | 988.904762 | 11.407384 |
25% | 5.75000 | 1063.638124 | 20.269739 |
50% | 10.50000 | 1200.884286 | 23.244399 |
75% | 15.25000 | 1382.030313 | 28.389854 |
max | 20.00000 | 1606.517241 | 65.560504 |
Now, let's plot the coefficient:
m = folium.Map(location=[48.87, 2.35], zoom_start=12)
m.choropleth(
geo_data='https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/paris.geojson',
key_on='feature.properties.cartodb_id',
data=geo_df,
columns=['id', 'coef'],
fill_color='BuPu',
threshold_scale=[10, 20, 30, 40],
legend_name='coefficient de croissance (€/m^2)',
highlight=True)
m
Floors
Let's see how floor numbers are distributed across the city.
fig, ax = plt.subplots(figsize=(15, 6))
sns.stripplot(x="codeinsee", y="etage", hue='surface_group', data=df, jitter=True, ax=ax);
What's the relationship between floor number and price?
sns.lmplot(x='etage', y='prix', data=df, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x11ba67400>
Almost no dependence on floors. However, a linear model is probably not the best way to represent the data, so I'm not sure this is a good conclusion.
Let's see how this varies across neighbourhoods.
sns.lmplot(x='etage', y='prix', hue='codeinsee', data=df, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x11bf67b00>
Interesting: the model is really bad when extrapolated to floor numbers that don't exist in the data (most Parisian buildings are only a handful of stories high).
Balconies
sns.violinplot(x='si_balcon', y='prix', data=df)
<matplotlib.axes._subplots.AxesSubplot at 0x11a761d30>
Apparently, balconies are worth a little extra money.
Conclusions
In this notebook, we scraped a website to build a dataset that we then analyzed with plots, choropleths and linear regressions. You could do lots of things with this data: for instance, train a machine learning model to estimate a fair price for a given apartment, or draw more detailed choropleths to analyze the influence of points of interest. You could even build a system that stores this data daily, to get a historical view of how the market develops.
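As a tiny illustration of the first idea, here is a minimal sketch that uses only the surface and the arrondissement as features, so definitely not a model to base a rental decision on:
# Minimal pricing sketch: predict the rent from the surface and the
# arrondissement (one-hot encoded).  Purely illustrative.
from sklearn.linear_model import LinearRegression

X = pd.get_dummies(df[['surface', 'codeinsee']], columns=['codeinsee'])
y = df['prix']
model = LinearRegression().fit(X, y)
model.predict(X.head(1))   # a "fair" price estimate for the first listing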
I'll conclude with a tweet that I read some time ago and that fits with the spirit of this exploration.
IMO: Data/ML projects are 10x more interesting if they involve lots of web scraping
— Erik Bernhardsson (@fulhack) March 12, 2017
This post was entirely written using the IPython notebook. Its content is BSD-licensed. You can see a static view or download this notebook with the help of nbviewer at 20170919_ParisAppartmentChoropleth.ipynb.