# import des packages 
from bs4 import BeautifulSoup
import requests
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import pandas as pd
from collections import OrderedDict
from scipy import stats
import numpy as np

# Root of the results site; every relative href scraped below is joined onto it.
url = 'http://elections.interieur.gouv.fr/legislatives-2017/'

# Download the landing page and parse it.
soup = BeautifulSoup(markup=requests.get(url).text, features='html.parser')

# One absolute URL per <a class="Style6"> anchor. The first two characters of
# each href are dropped before joining (presumably a "./" prefix — TODO confirm).
links = [url + anchor['href'][2:] for anchor in soup.find_all('a', attrs={'class': 'Style6'})]

from functools import lru_cache

@lru_cache(maxsize=None)
def fetch_page(url, *, timeout=30):
    """Download ``url`` and return the response, memoised per URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The ``requests.Response`` for ``url``. Repeated calls with the same
        URL return the cached object without hitting the network again.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error status: because the exception propagates, the
    # bad response is never stored in the cache and a later retry re-fetches.
    r.raise_for_status()
    return r

# Walk every page listed in `links` and keep the anchors whose visible text
# mentions "circonscription" (case-insensitive).
circo_links = []
for link in links:
    soup = BeautifulSoup(fetch_page(link).text, 'html.parser')
    for tag in soup.find_all('a'):
        if 'circonscription' not in tag.text.lower():
            continue
        # The first three characters of the href are dropped before joining
        # onto the site root (presumably a "../" prefix — TODO confirm).
        circo_links.append(url + tag.attrs['href'][3:])

# Number of result pages collected.
len(circo_links)

577

# Fetch the first result page to explore its structure.
r = fetch_page(circo_links[0])

soup = BeautifulSoup(r.text, 'html.parser')

# Title of the page: text of the first <h3>, with newlines and tabs removed,
# cut just before the word " circonscription".
soup.find('h3').text.translate(str.maketrans('', '', '\n\t')).partition(' circonscription')[0]

'Ain (01) - 1ère'

# Second <tbody> of the page: one row per turnout category. The count is read
# two siblings after the first cell and has its spaces removed before int().
table = soup.find_all('tbody')[1]
votes = OrderedDict(
    (row.td.text, int(row.td.next_sibling.next_sibling.text.replace(' ', '')))
    for row in table.find_all('tr')
)

pd.Series(votes).to_frame()

# First <tbody>: one row per candidate. The first cell is used as the key and
# the remaining cells are kept as cleaned strings (decimal comma -> dot,
# spaces removed).
table = soup.find_all('tbody')[0]
candidates = OrderedDict()
for row in table.find_all('tr'):
    cells = row.find_all('td')
    candidates[row.td.text] = [
        cell.text.strip().replace(',', '.').replace(' ', '') for cell in cells[1:]
    ]

pd.DataFrame(candidates).transpose()

def extract_circo_data(url):
    """Scrape one result page and return its name, candidates and turnout.

    Args:
        url: Address of a circonscription result page (fetched through
            ``fetch_page``).

    Returns:
        A 3-tuple ``(circo_name, candidates, votes)``:

        * ``circo_name`` - text of the first ``<h3>`` without newlines/tabs,
          truncated before " circonscription".
        * ``candidates`` - DataFrame with one row per row of the first
          ``<tbody>``, indexed by the first cell's text; the other cells are
          kept as strings with commas turned into dots and spaces removed.
        * ``votes`` - single-column DataFrame (column named ``circo_name``)
          built from the second ``<tbody>``: first cell's text -> integer.
    """
    page = BeautifulSoup(fetch_page(url).text, 'html.parser')

    heading = page.find('h3').text
    circo_name = heading.replace('\n', '').replace('\t', '').split(' circonscription')[0]

    tbodies = page.find_all('tbody')

    candidates = OrderedDict()
    for row in tbodies[0].find_all('tr'):
        candidates[row.td.text] = [
            cell.text.strip().replace(',', '.').replace(' ', '')
            for cell in row.find_all('td')[1:]
        ]

    # The count sits two siblings after the label cell (presumably a
    # whitespace text node separates the two <td> — TODO confirm).
    counts = OrderedDict(
        (row.td.text, int(row.td.next_sibling.next_sibling.text.replace(' ', '')))
        for row in tbodies[1].find_all('tr')
    )
    votes = pd.Series(counts).to_frame()
    votes.columns = [circo_name]

    return circo_name, pd.DataFrame(candidates).transpose(), votes

# Spot-check the extractor on one page picked by position (index 245).
circo_name, candidates, votes = extract_circo_data(circo_links[245])

# Display the parsed name.
circo_name

'Haute-Marne (52) - 2ème'

# Display the turnout frame and the candidate frame of the spot-checked page.
votes

candidates

# Turnout frame (third element of the extractor's result) for every page.
all_votes_data = [extract_circo_data(page_url)[-1] for page_url in circo_links]

# Side by side: one column per page, one row per turnout category.
all_votes = pd.concat(all_votes_data, axis='columns')

all_votes

# Totals per category over all columns.
all_votes.sum(axis='columns').to_frame(name='France entière')

all_votes.sum(axis='columns').to_frame('France entière').plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x113248e10>

# Tall figure: one horizontal stacked bar per column of all_votes, drawn in
# reverse order so the first column ends up at the top.
fig, ax = plt.subplots(figsize=(8, 75), dpi=100)
(
    all_votes.loc[['Abstentions', 'Blancs', 'Nuls', 'Exprimés']]
    .T
    .iloc[::-1]
    .plot.barh(stacked=True, ax=ax, width=1.0)
)

<matplotlib.axes._subplots.AxesSubplot at 0x113292f98>

# Same table with one row per circonscription, every column divided by that
# row's 'Inscrits' value (so 'Inscrits' itself becomes 1.0).
all_votes_relative = all_votes.T.div(all_votes.T['Inscrits'], axis='index')

all_votes_relative

# Stacked horizontal bars of the relative shares, first row at the top.
fig, ax = plt.subplots(figsize=(8, 75), dpi=100)
(
    all_votes_relative[['Abstentions', 'Blancs', 'Nuls', 'Exprimés']]
    .iloc[::-1]
    .plot.barh(stacked=True, ax=ax, width=1.0)
)

<matplotlib.axes._subplots.AxesSubplot at 0x1114f47f0>

# seaborn is only needed from this point on (pair plot and violin plots).
import seaborn as sns

# Pairwise scatter plots between the turnout categories, one point per
# column of all_votes.
sns.pairplot(all_votes.transpose())

<seaborn.axisgrid.PairGrid at 0x11b82f048>

# Build one long table of candidates over every result page.
all_candidates_data = []
# The loop variable is deliberately NOT called `url`: that name holds the
# site root used to build links, and overwriting it would break any re-run
# of the scraping cells.
for circo_url in circo_links:
    # NOTE: this rebinds the notebook-level circo_name / candidates / votes;
    # after the loop they describe the last page visited.
    circo_name, candidates, votes = extract_circo_data(circo_url)
    candidates.columns = ['Nuances', 'Voix', '% Inscrits', '% Exprimés', 'Ballotage']
    # The extractor returns cleaned strings; convert the numeric columns.
    for col in ['Voix', '% Inscrits', '% Exprimés']:
        candidates[col] = pd.to_numeric(candidates[col])
    candidates['Circonscription'] = circo_name
    all_candidates_data.append(candidates)

all_candidates = pd.concat(all_candidates_data)

# Rows whose 'Ballotage' cell is 'Oui'.
all_candidates[all_candidates.Ballotage == 'Oui']

# Total votes per nuance. The column is selected before aggregating so the
# text columns are not summed as well.
all_candidates.groupby('Nuances')['Voix'].sum().sort_values().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x12451e198>

# How many rows marked 'Ballotage*' each nuance has.
all_candidates.loc[all_candidates['Ballotage'] == 'Ballotage*', 'Nuances'].value_counts()

REM    454
LR     265
FN     120
FI      67
SOC     65
MDM     62
UDI     35
DVD     22
DVG     18
COM     12
REG      9
DIV      9
RDG      5
ECO      2
EXD      1
DLF      1
Name: Nuances, dtype: int64

# Bar chart of the number of 'Ballotage*' rows per nuance, smallest first.
(
    all_candidates[all_candidates['Ballotage'] == 'Ballotage*']
    .groupby('Nuances')['Voix']
    .count()
    .sort_values()
    .plot.bar()
)

<matplotlib.axes._subplots.AxesSubplot at 0x12450d2e8>

# Per nuance: number of rows marked 'Ballotage*' (an upper bound on seats that
# nuance can still win) and total first-round votes.
max_mps = all_candidates[all_candidates.Ballotage == 'Ballotage*'].groupby('Nuances')['Voix'].count()
by_voice = all_candidates.groupby('Nuances')['Voix'].sum()

mp_voice = pd.DataFrame([max_mps, by_voice]).transpose()
mp_voice.columns = ['Députés max.', 'Voix']
# Nuances with no 'Ballotage*' row have no count; treat that as zero.
mp_voice = mp_voice.fillna(value=0)

mp_voice.plot.scatter(x='Voix', y='Députés max.')
for label, data in mp_voice.iterrows():
    # The label is passed positionally: its keyword was renamed from `s` to
    # `text` in Matplotlib 3.3, so the positional form works on every version.
    plt.annotate(label, xy=(data['Voix'], data['Députés max.']))
# Reference line: 577 seats spread proportionally over the 'Votants' total.
# NOTE(review): candidate votes add up to 'Exprimés', not 'Votants' — confirm
# which denominator is intended.
plt.plot([0, all_votes.loc['Votants'].sum()], [0, 577], label='proportionnelle')
plt.legend(loc='lower right')
plt.xlim(0, 7e6)
plt.ylim(-10, 500)

(-10, 500)

# Rows marked 'Ballotage*' in 'Ain (01) - 1ère'.
all_candidates[
    (all_candidates['Circonscription'] == 'Ain (01) - 1ère')
    & (all_candidates['Ballotage'] == 'Ballotage*')
]

# Vote counts of two candidates, typed in by hand for the demo.
voices = [13534, 10693]

# Discrete distribution over the candidate positions 0..n-1, each weighted by
# its share of the votes.
xk = np.arange(len(voices))
pk = np.asarray(voices, dtype=float) / sum(voices)
custm = stats.rv_discrete(name='custm', values=(xk, pk))

# Draw as many samples as there are votes and count how often each position
# comes up.
pd.Series(custm.rvs(size=sum(voices))).value_counts()

0    13604
1    10623
dtype: int64

def predict_winner(circo_name, size=1):
    """Simulate which nuance wins a given circonscription.

    Reads the notebook-level ``all_candidates`` table.

    * If a row of the circonscription has ``Ballotage == 'Oui'``, its nuance
      is returned ``size`` times (no randomness involved).
    * Otherwise the winner is drawn ``size`` times among the rows marked
      ``'Ballotage*'``, each with probability proportional to its ``Voix``.

    Args:
        circo_name: Value of the ``Circonscription`` column to simulate.
        size: Number of independent simulations.

    Returns:
        numpy array of nuance labels, one per simulation.

    Raises:
        ValueError: If ``circo_name`` is unknown, or if it has neither a
            'Oui' row nor any 'Ballotage*' row with votes to draw from.
    """
    df_circo = all_candidates[all_candidates.Circonscription == circo_name]
    if df_circo.empty:
        raise ValueError('unknown circonscription: {!r}'.format(circo_name))

    # Element-wise comparison first, then any(): calling any() directly on the
    # text column yields a bool (or an arbitrary string on old NumPy), on
    # which the former `'Oui' in ...` test was either a TypeError or a lucky
    # substring match against the first row only.
    elected = df_circo.Ballotage == 'Oui'
    if elected.any():
        return np.array(list(df_circo.Nuances[elected].values) * size)

    mask = df_circo.Ballotage == 'Ballotage*'
    voices = np.asarray(df_circo.Voix[mask], dtype=float)
    nuances = np.asarray(df_circo.Nuances[mask])
    if voices.size == 0 or voices.sum() <= 0:
        raise ValueError('no runoff candidates with votes in {!r}'.format(circo_name))

    # Discrete distribution over the runoff rows, weighted by vote share.
    xk = np.arange(voices.size)
    pk = voices / voices.sum()
    custm = stats.rv_discrete(name='custm', values=(xk, pk))
    return nuances[custm.rvs(size=size)]

# Draw 100 simulated winners for whichever circonscription `circo_name`
# currently holds.
predict_winner(circo_name, size=100)

array(['REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'LR', 'REM', 'LR', 'LR',
       'REM', 'LR', 'LR', 'REM', 'REM', 'REM', 'REM', 'LR', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'LR', 'REM', 'REM', 'REM',
       'REM', 'LR', 'REM', 'LR', 'REM', 'REM', 'LR', 'REM', 'REM', 'REM',
       'LR', 'LR', 'LR', 'REM', 'LR', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'LR', 'LR', 'LR', 'LR', 'REM', 'REM',
       'LR', 'REM', 'REM', 'LR'], dtype=object)

# Same call for a seat that the results table pasted further down lists with
# a 'Oui' row, i.e. the branch that returns a fixed winner.
predict_winner('Morbihan (56) - 4ème', size=100)

array(['REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM',
       'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM', 'REM'], 
      dtype='<U3')

# 10000 simulated winners for every circonscription (one array per seat).
simulation_data = [predict_winner(circo_name, 10000) for circo_name in all_candidates.Circonscription.unique()]

# After the transpose: one row per simulated parliament, one column per seat.
parliaments = pd.DataFrame(simulation_data, index=all_candidates.Circonscription.unique()).transpose()

# Seats per nuance in each simulated parliament (one column per simulation);
# a nuance absent from a simulation gets 0.
parliament_counts = pd.concat([series.value_counts() for ind, series in parliaments.iterrows()], axis=1).fillna(value=0)

# Five-number summary of the seat counts per nuance, ordered by mean.
parliament_counts.transpose().describe().transpose().sort_values(by='mean')[['min', '25%', '50%', '75%', 'max']].transpose().astype(int)

sorted_columns = parliament_counts.transpose().describe().transpose().sort_values(by='mean').index.values
# The frame must be passed as `data=`: given positionally, seaborn warns that
# "The violinplot API has been changed" and has to guess the intent.
sns.violinplot(data=parliament_counts.transpose()[sorted_columns])

/Users/kappamaki/anaconda/lib/python3.5/site-packages/seaborn/categorical.py:2342: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x12dce0898>

# Median number of seats per nuance over the simulated parliaments.
parliament_counts.transpose().describe().transpose().sort_values(by='mean')[['50%']].transpose().astype(int)

# Seats each nuance would get if the 577 seats followed first-round votes.
# The column is selected before aggregating so text columns are not summed.
# NOTE(review): the denominator is the 'Votants' total while candidate votes
# add up to 'Exprimés', so these shares sum to less than 577 — confirm intent.
proportion_seats = all_candidates.groupby('Nuances')['Voix'].sum() / all_votes.loc['Votants'].sum() * 577

# Overlay both models on one axis: bars = proportional, violins = simulation.
fig, ax = plt.subplots(figsize=(10, 7))
proportion_seats[sorted_columns].to_frame(name='sièges proportionnels aux voix du 1er tour').plot.bar(ax=ax)
# `data=` keyword: passing the frame positionally triggers seaborn's
# "The violinplot API has been changed" warning.
sns.violinplot(data=parliament_counts.transpose()[sorted_columns], label='modèle probabiliste', ax=ax)
plt.ylabel("sièges prédits à l'assemblée")
plt.legend(loc='upper left')
# Trailing semicolon keeps the notebook from echoing the Text object.
plt.title("sièges à l'assemblée après les résultats du premier tour :\ncomparaison entre un modèle proportionnel et un modèle probabiliste");

/Users/kappamaki/anaconda/lib/python3.5/site-packages/seaborn/categorical.py:2342: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)

	0	1	2	3	4
M. Laurent MALLET	MDM	13534	16.37	33.89	Ballotage*
M. Xavier BRETON	LR	10693	12.93	26.78	Ballotage*
M. Jérôme BUISSON	FN	6174	7.47	15.46	Non
Mme Fabrine MARTIN ZEMLIK	FI	3874	4.68	9.70	Non
Mme Florence BLATRIX-CONTAT	SOC	3687	4.46	9.23	Non
M. Jacques FONTAINE	COM	656	0.79	1.64	Non
Mme Laurane RAIMONDO	ECO	562	0.68	1.41	Non
Mme Maude LÉPAGNOT	EXG	293	0.35	0.73	Non
Mme Marie CARLIER	DIV	247	0.30	0.62	Non
M. Gilbert BONNOT	DIV	211	0.26	0.53	Non

	0	1	2	3	4
M. François CORNUT-GENTILLE	LR	9808	15.92	33.81	Ballotage*
M. Frédéric FABRE	FN	8431	13.69	29.06	Ballotage*
M. Vincent BERTHET	REM	6148	9.98	21.19	Non
M. Daniel MONNIER	FI	2380	3.86	8.20	Non
M. Antoine DESFRETIER	SOC	966	1.57	3.33	Non
Mme Valérie ROFFIDAL	ECO	466	0.76	1.61	Non
M. Edouard GONZALEZ	COM	414	0.67	1.43	Non
Mme Anne HALIN	EXG	214	0.35	0.74	Non
Mme Laurence OLIVIER	DIV	183	0.30	0.63	Non

	Ain (01) - 1ère	Ain (01) - 2ème	Ain (01) - 3ème	Ain (01) - 4ème	Ain (01) - 5ème	Aisne (02) - 1ère	Aisne (02) - 2ème	Aisne (02) - 3ème	Aisne (02) - 4ème	Aisne (02) - 5ème	...	Français établis hors de France (99) - 2ème	Français établis hors de France (99) - 3ème	Français établis hors de France (99) - 4ème	Français établis hors de France (99) - 5ème	Français établis hors de France (99) - 6ème	Français établis hors de France (99) - 7ème	Français établis hors de France (99) - 8ème	Français établis hors de France (99) - 9ème	Français établis hors de France (99) - 10ème	Français établis hors de France (99) - 11ème
Inscrits	82694	93520	75614	89390	75359	72345	73981	68099	79116	82223	...	75029	120696	122765	91374	127486	105955	121399	107796	99374	92766
Abstentions	42063	47291	41131	45625	38409	36770	39857	35369	43878	41864	...	63414	95202	94943	76810	101742	78999	109986	92085	79955	67141
Votants	40631	46229	34483	43765	36950	35575	34124	32730	35238	40359	...	11615	25494	27822	14564	25744	26956	11413	15711	19419	25625
Blancs	545	471	359	521	374	517	685	639	530	609	...	36	43	68	71	47	50	41	140	175	93
Nuls	155	160	116	211	168	173	270	287	208	251	...	79	81	90	48	92	332	72	122	79	152
Exprimés	39931	45598	34008	43033	36408	34885	33169	31804	34500	39499	...	11500	25370	27664	14445	25605	26574	11300	15449	19165	25380

	Inscrits	Abstentions	Votants	Blancs	Nuls	Exprimés
Ain (01) - 1ère	1.0	0.508658	0.491342	0.006591	0.001874	0.482877
Ain (01) - 2ème	1.0	0.505678	0.494322	0.005036	0.001711	0.487575
Ain (01) - 3ème	1.0	0.543960	0.456040	0.004748	0.001534	0.449758
Ain (01) - 4ème	1.0	0.510404	0.489596	0.005828	0.002360	0.481407
Ain (01) - 5ème	1.0	0.509680	0.490320	0.004963	0.002229	0.483127
Aisne (02) - 1ère	1.0	0.508259	0.491741	0.007146	0.002391	0.482203
Aisne (02) - 2ème	1.0	0.538746	0.461254	0.009259	0.003650	0.448345
Aisne (02) - 3ème	1.0	0.519376	0.480624	0.009383	0.004214	0.467026
Aisne (02) - 4ème	1.0	0.554603	0.445397	0.006699	0.002629	0.436069
Aisne (02) - 5ème	1.0	0.509152	0.490848	0.007407	0.003053	0.480389
Allier (03) - 1ère	1.0	0.475640	0.524360	0.010407	0.004450	0.509503
Allier (03) - 2ème	1.0	0.493498	0.506502	0.010770	0.005415	0.490317
Allier (03) - 3ème	1.0	0.490003	0.509997	0.009018	0.003701	0.497278
Alpes-de-Haute-Provence (04) - 1ère	1.0	0.474919	0.525081	0.011416	0.005152	0.508513
Alpes-de-Haute-Provence (04) - 2ème	1.0	0.468405	0.531595	0.008037	0.003689	0.519870
Hautes-Alpes (05) - 1ère	1.0	0.484582	0.515418	0.009956	0.003463	0.502000
Hautes-Alpes (05) - 2ème	1.0	0.445456	0.554544	0.007343	0.002895	0.544306
Alpes-Maritimes (06) - 1ère	1.0	0.509892	0.490108	0.003977	0.002150	0.483981
Alpes-Maritimes (06) - 2ème	1.0	0.515099	0.484901	0.007497	0.002058	0.475346
Alpes-Maritimes (06) - 3ème	1.0	0.544768	0.455232	0.005161	0.002558	0.447513
Alpes-Maritimes (06) - 4ème	1.0	0.533855	0.466145	0.005351	0.003657	0.457136
Alpes-Maritimes (06) - 5ème	1.0	0.551346	0.448654	0.009226	0.003891	0.435537
Alpes-Maritimes (06) - 6ème	1.0	0.523864	0.476136	0.007500	0.001167	0.467470
Alpes-Maritimes (06) - 7ème	1.0	0.514676	0.485324	0.006835	0.001206	0.477283
Alpes-Maritimes (06) - 8ème	1.0	0.554208	0.445792	0.006281	0.001817	0.437693
Alpes-Maritimes (06) - 9ème	1.0	0.526472	0.473528	0.005642	0.001251	0.466635
Ardèche (07) - 1ère	1.0	0.486994	0.513006	0.009304	0.003811	0.499891
Ardèche (07) - 2ème	1.0	0.464164	0.535836	0.006737	0.002719	0.526380
Ardèche (07) - 3ème	1.0	0.442153	0.557847	0.007487	0.003539	0.546820
Ardennes (08) - 1ère	1.0	0.521263	0.478737	0.004990	0.002822	0.470924
...	...	...	...	...	...	...
Guyane (973) - 1ère	1.0	0.761466	0.238534	0.008741	0.004715	0.225078
Guyane (973) - 2ème	1.0	0.739048	0.260952	0.006900	0.003805	0.250247
La Réunion (974) - 1ère	1.0	0.639062	0.360938	0.013179	0.009800	0.337958
La Réunion (974) - 2ème	1.0	0.678201	0.321799	0.010668	0.011196	0.299935
La Réunion (974) - 3ème	1.0	0.621123	0.378877	0.019038	0.019944	0.339895
La Réunion (974) - 4ème	1.0	0.582800	0.417200	0.016269	0.020393	0.380539
La Réunion (974) - 5ème	1.0	0.701249	0.298751	0.011915	0.013827	0.273009
La Réunion (974) - 6ème	1.0	0.696498	0.303502	0.013459	0.012945	0.277099
La Réunion (974) - 7ème	1.0	0.670173	0.329827	0.016119	0.015630	0.298078
Mayotte (976) - 1ère	1.0	0.577151	0.422849	0.016896	0.031502	0.374451
Mayotte (976) - 2ème	1.0	0.539025	0.460975	0.017543	0.026381	0.417052
Nouvelle-Calédonie (988) - 1ère	1.0	0.660650	0.339350	0.008843	0.002748	0.327759
Nouvelle-Calédonie (988) - 2ème	1.0	0.628350	0.371650	0.009672	0.004138	0.357841
Polynésie française (987) - 1ère	1.0	0.582930	0.417070	0.004780	0.003747	0.408543
Polynésie française (987) - 2ème	1.0	0.589746	0.410254	0.004063	0.004138	0.402054
Polynésie française (987) - 3ème	1.0	0.564492	0.435508	0.006634	0.004073	0.424800
Saint-Pierre-et-Miquelon (975) - 1ère	1.0	0.405509	0.594491	0.006031	0.004021	0.584439
Wallis et Futuna (986) - 1ère	1.0	0.187264	0.812736	0.003656	0.002594	0.806486
Saint-Martin/Saint-Barthélemy (977) - 1ère	1.0	0.759797	0.240203	0.004490	0.002875	0.232838
Français établis hors de France (99) - 1ère	1.0	0.813646	0.186354	0.000290	0.001269	0.184796
Français établis hors de France (99) - 2ème	1.0	0.845193	0.154807	0.000480	0.001053	0.153274
Français établis hors de France (99) - 3ème	1.0	0.788775	0.211225	0.000356	0.000671	0.210198
Français établis hors de France (99) - 4ème	1.0	0.773372	0.226628	0.000554	0.000733	0.225341
Français établis hors de France (99) - 5ème	1.0	0.840611	0.159389	0.000777	0.000525	0.158087
Français établis hors de France (99) - 6ème	1.0	0.798064	0.201936	0.000369	0.000722	0.200846
Français établis hors de France (99) - 7ème	1.0	0.745590	0.254410	0.000472	0.003133	0.250805
Français établis hors de France (99) - 8ème	1.0	0.905988	0.094012	0.000338	0.000593	0.093081
Français établis hors de France (99) - 9ème	1.0	0.854252	0.145748	0.001299	0.001132	0.143317
Français établis hors de France (99) - 10ème	1.0	0.804587	0.195413	0.001761	0.000795	0.192857
Français établis hors de France (99) - 11ème	1.0	0.723767	0.276233	0.001003	0.001639	0.273592

	Nuances	Voix	% Inscrits	% Exprimés	Ballotage	Circonscription
M. Paul MOLAC	REM	30166	28.70	54.00	Oui	Morbihan (56) - 4ème
M. Sylvain MAILLARD	REM	24037	29.76	50.80	Oui	Paris (75) - 1ère
M. Stéphane DEMILLY	UDI	21505	26.40	53.85	Oui	Somme (80) - 5ème
M. Napole POLUTELE	DVG	3436	40.52	50.24	Oui	Wallis et Futuna (986) - 1ère

Les résultats du premier tour de l'élection législative 2017

Téléchargement des données depuis le site du ministère de l'intérieur

Analyse des statistiques du vote par circonscription

Par candidat

Modèle probabiliste pour les résultats du second tour des législatives

Conclusions

Comments

	Haute-Marne (52) - 2ème
Inscrits	61604
Abstentions	32008
Votants	29596
Blancs	441
Nuls	145
Exprimés	29010

	France entière
Inscrits	47570988
Abstentions	24403480
Votants	23167508
Blancs	357018
Nuls	156326
Exprimés	22654164

	EXD	DLF	ECO	RDG	DIV	REG	COM	DVD	DVG	UDI	FI	SOC	MDM	FN	LR	REM
min	0	0	0	0	0	0	0	3	4	5	7	12	24	31	81	242
25%	0	0	1	1	3	3	4	8	9	14	19	24	35	47	103	272
50%	0	0	1	2	4	4	6	9	10	16	22	26	38	50	109	279
75%	1	1	1	3	5	5	7	11	11	18	25	29	40	54	114	286
max	1	1	2	5	9	9	11	18	16	28	38	42	52	72	138	313

	EXD	DLF	ECO	RDG	DIV	REG	COM	DVD	DVG	UDI	FI	SOC	MDM	FN	LR	REM
min	0	0	0	0	0	0	0	3	4	5	7	12	24	31	81	242
25%	0	0	1	1	3	3	4	8	9	14	19	24	35	47	103	272
50%	0	0	1	2	4	4	6	9	10	16	22	26	38	50	109	279
75%	1	1	1	3	5	5	7	11	11	18	25	29	40	54	114	286
max	1	1	2	5	9	9	11	18	16	28	38	42	52	72	138	313

	EXD	DLF	ECO	RDG	DIV	REG	COM	DVD	DVG	UDI	FI	SOC	MDM	FN	LR	REM
min	0	0	0	0	0	0	0	3	4	5	7	12	24	31	81	242
25%	0	0	1	1	3	3	4	8	9	14	19	24	35	47	103	272
50%	0	0	1	2	4	4	6	9	10	16	22	26	38	50	109	279
75%	1	1	1	3	5	5	7	11	11	18	25	29	40	54	114	286
max	1	1	2	5	9	9	11	18	16	28	38	42	52	72	138	313