Analyzing the European Parliament

[1]:
import json

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.patheffects as PathEffects
from matplotlib.ticker import FuncFormatter, FixedLocator

import bokeh.models as bmo
import bokeh.plotting as bpl
from bokeh.palettes import d3

import prince
import palettable

from tqdm.auto import tqdm
[2]:
# use seaborn's 'talk' plotting context for every figure in this notebook
sns.set_context('talk')
# load BokehJS so that Bokeh figures are displayed inside the notebook
bpl.output_notebook()
Loading BokehJS ...

Download data

All data is readily available on Parltrack.

[3]:
# %%bash

# wget --no-clobber https://parltrack.org/dumps/ep_votes.json.lz
# lzip -d ep_votes.json.lz

# wget --no-clobber https://parltrack.org/dumps/ep_meps.json.lz
# lzip -d ep_meps.json.lz

Transform JSON to dataframes

To easily work with the data, we transform it from JSON to pandas dataframes.

MEPs

[4]:
fname = 'ep_meps.json'

# Build one flat record per MEP from the Parltrack dump. The file is parsed
# line by line: every non-empty line is expected to hold a single JSON object,
# optionally preceded by the '[', ',' or ']' of the enclosing JSON array.
tmp = []
# be explicit about UTF-8 so accented names do not depend on the platform default
with open(fname, encoding='utf-8') as fd:
    # iterate over the file lazily instead of materialising it with readlines()
    for line in tqdm(fd):
        # strip the trailing newline too; otherwise a bare ']' line is reduced
        # to '\n', passes the emptiness check and crashes json.loads
        line = line.strip().lstrip('[,]')
        if len(line) == 0:
            continue

        data = json.loads(line)

        # inactive MEPs are kept here; later cells filter on the 'active' column

        # a missing or empty 'Groups' list yields NaN instead of an IndexError
        groups = data.get('Groups') or [{'groupid': np.nan}]

        tmp.append(
            {
                'UserID': data['UserID'],
                'name': data['Name']['full'],
                'birthday': data['Birth']['date'] if 'Birth' in data else np.nan,
                'active': data['active'],
                # assumption: the last entry of 'Groups' is the most recent
                # one -- TODO confirm against the Parltrack schema
                'group': groups[-1]['groupid'],
            }
        )
[5]:
# One row per MEP, indexed by the Parltrack UserID.
df_meps = pd.DataFrame(tmp)
df_meps['birthday'] = pd.to_datetime(df_meps['birthday'])

df_meps = df_meps.set_index('UserID')

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form acts on a temporary object and no
# longer updates df_meps once pandas Copy-on-Write is active.
df_meps['group'] = df_meps['group'].replace(
    {'Group of the European United Left - Nordic Green Left': 'GUE/NGL'}
)  # open question from the original author: is there a difference between the two labels?

df_meps.head()
[5]:
name birthday active group
UserID
2307 Hubert PIRKER 1948-10-03 False PPE
111496 María Auxiliadora CORREA ZAMORA 1972-05-24 False PPE
110987 Gino TREMATERRA 1940-09-03 False PPE
1965 Jan MULDER 1943-10-03 False ALDE
39321 Vicente Miguel GARCÉS RAMÓN 1946-11-10 False S&D

Votes

[6]:
fname = 'ep_votes.json'

# tmp collects one metadata record per vote; tmp_matrix maps each voteid to a
# {mepid: outcome} dict, where outcome is '+', '-' or '0'.
tmp = []
tmp_matrix = {}
# be explicit about UTF-8 so non-ASCII titles do not depend on the platform default
with open(fname, encoding='utf-8') as fd:
    # iterate over the file lazily instead of materialising it with readlines()
    for line in tqdm(fd):
        # strip the trailing newline too; otherwise a bare ']' line is reduced
        # to '\n', passes the emptiness check and crashes json.loads
        line = line.strip().lstrip('[,]')
        if len(line) == 0:
            continue

        data = json.loads(line)
        tmp.append(
            {'date': data['ts'], 'voteid': data['voteid'], 'title': data['title']}
        )

        if 'votes' in data:
            mep_votes = {}
            # Same order as the original three merged dicts: if an MEP shows up
            # under several outcomes, the later outcome wins.
            for outcome in ('+', '-', '0'):
                # an outcome nobody chose may be absent from data['votes']
                groups = data['votes'].get(outcome, {'groups': {}})['groups']
                for mep_list in groups.values():
                    for mep in mep_list:
                        # entries without a 'mepid' cannot be attributed and are skipped
                        if 'mepid' in mep:
                            mep_votes[mep['mepid']] = outcome
            tmp_matrix[data['voteid']] = mep_votes
[7]:
# Vote matrix: one row per voteid, one column per mepid; a cell holds the
# recorded outcome and is NaN where the MEP has no entry for that vote.
df_votematrix = pd.DataFrame.from_dict(tmp_matrix, orient='index').rename_axis(
    index='voteid', columns='mepid'
)

# Rows keep the order in which the votes were read (sorting by voteid is left
# disabled); only the MEP columns are ordered by id.
df_votematrix = df_votematrix.sort_values('mepid', axis=1)

df_votematrix.head()
[7]:
mepid 1 234 684 729 840 945 966 988 997 1002 ... 204416 204418 204419 204420 204421 204443 204449 204733 205452 206158
voteid
7754 NaN NaN + + NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7818 - + - - NaN - NaN NaN + - ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7759 + + + + NaN + + NaN + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7755 NaN 0 + NaN NaN + 0 NaN + + ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7760 - - - - NaN - + NaN + - ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 2329 columns

[8]:
# Vote metadata (timestamp and title), indexed by voteid.
vote_records = pd.DataFrame(tmp)
df_votes = vote_records.assign(date=pd.to_datetime(vote_records['date'])).set_index(
    'voteid'
)

df_votes.tail()
[8]:
date title
voteid
116359 2020-07-23 12:49:32 B9-0229/2020 - Am 23
116398 2020-07-23 12:49:32 B9-0229/2020 - § 26/1
116399 2020-07-23 12:49:32 B9-0229/2020 - § 26/2
116360 2020-07-23 12:49:32 B9-0229/2020 - Am 1
116401 2020-07-23 16:52:06 B9-0229/2020 - Résolution

Exploration

We can then look at a few simple statistics.

MEP party distribution

How many members (MEPs) does each political group of the European Parliament have?

[9]:
# number of MEPs whose 'active' flag is set (True counts as 1 in the sum)
df_meps['active'].sum()
[9]:
705
[10]:
# number of active MEPs per group, largest group first
active_groups = df_meps.loc[df_meps['active'], 'group']
group_counts = active_groups.value_counts()

# wedge labels of the form "<group> (<member count>)"
labels = pd.Series(
    [f'{group} ({count})' for group, count in zip(group_counts.index, group_counts)],
    index=group_counts.index,
)

# wedgeprops width < 1 turns the pie into a donut
ax = group_counts.plot.pie(
    figsize=(8, 6), labels=labels, wedgeprops={'width': 0.5}
)
ax.axis('equal')
[10]:
(-1.1107175100739686,
 1.1005103586792213,
 -1.1057638158926402,
 1.1094386022707399)
../_images/mining_visualization_MEP_Statistics_16_1.png

MEP age distribution

And how old are these members?

[11]:
# Age in (fractional) years at the time the cell is executed.
# The original divided by np.timedelta64(1, 'Y'); 'Y' is not a fixed-length
# unit and current pandas releases refuse it in timedelta arithmetic. Dividing
# by an explicit average Gregorian year (365.2425 days, the length numpy uses
# for 'Y') yields the same values without relying on the ambiguous unit.
DAYS_PER_YEAR = 365.2425
df_meps['age'] = (pd.Timestamp.today() - df_meps['birthday']) / pd.Timedelta(
    days=DAYS_PER_YEAR
)
[12]:
# one age histogram per group, restricted to active MEPs
active_meps = df_meps.loc[df_meps['active']]
facet_options = {'col': 'group', 'col_wrap': 3, 'height': 3, 'aspect': 4 / 3}

g = sns.displot(active_meps, x='age', **facet_options)

g.set_xlabels('MEP age [years]')
[12]:
<seaborn.axisgrid.FacetGrid at 0x1635ff130>
../_images/mining_visualization_MEP_Statistics_19_1.png

Voting patterns

Let’s now take a look at how these MEPs vote.

[13]:
# reminder of the vote matrix layout: rows are votes, columns are MEPs
df_votematrix.head()
[13]:
mepid 1 234 684 729 840 945 966 988 997 1002 ... 204416 204418 204419 204420 204421 204443 204449 204733 205452 206158
voteid
7754 NaN NaN + + NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7818 - + - - NaN - NaN NaN + - ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7759 + + + + NaN + + NaN + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7755 NaN 0 + NaN NaN + 0 NaN + + ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7760 - - - - NaN - + NaN + - ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 2329 columns

Who is the most active MEP?

Here we equate “active” with “has voted most often”. This is most likely quite misleading.

[14]:
# boolean matrix (vote x active MEP): True where an outcome was recorded
active_mep_ids = df_meps.loc[df_meps['active']].index
df_hasvoted = df_votematrix[active_mep_ids].notna()
[15]:
# number of recorded votes per MEP, highest first
vote_counts = df_hasvoted.sum(axis=0).sort_values(ascending=False)

# attach the MEP details and show the ten most frequent voters
top_voters = vote_counts.to_frame('vote_count').merge(
    df_meps, how='left', left_index=True, right_index=True
)
top_voters.head(10)
[15]:
vote_count name birthday active group age
mepid
28266 22954 Sophia in 't VELD 1963-09-13 True RE 57.711345
1913 22703 Evelyne GEBHARDT 1954-01-19 True S&D 67.359729
2323 22394 Rainer WIELAND 1957-02-19 True PPE 64.274108
4246 22324 Othmar KARAS 1957-12-24 True PPE 63.430833
28219 22324 Daniel CASPARY 1976-04-04 True PPE 45.152566
2341 22269 Michael GAHLER 1960-04-22 True PPE 61.103612
28224 22164 Markus PIEPER 1963-05-15 True PPE 58.042632
28298 21992 Iratxe GARCÍA PÉREZ 1974-10-07 True S&D 46.644725
23821 21912 József SZÁJER 1961-09-07 True PPE 59.726445
28223 21910 Andreas SCHWAB 1973-04-09 True PPE 48.139622

Select subset of data

For the subsequent vote clustering, we restrict ourselves to votes held since the start of 2020 and to active MEPs (MEPs without any recorded vote in that period are removed).

[16]:
# votes dated after the start of 2020 that are present in the vote matrix
recent_vote_ids = df_votes.loc[df_votes['date'] > '2020'].index
selected_votes = df_votematrix.index.intersection(recent_vote_ids)

# columns: active MEPs only
active_mep_ids = df_meps.loc[df_meps['active']].index

# drop MEPs that have no recorded outcome in any of the selected votes
df_subset = df_votematrix.loc[selected_votes, active_mep_ids].dropna(
    how='all', axis=1
)

print(df_subset.shape)
df_subset.head()
(1808, 703)
[16]:
UserID 96750 4746 23788 96810 96808 4560 38595 1992 125106 4391 ... 204413 204334 204331 204346 204449 204400 197780 204733 205452 206158
voteid
111241 0 NaN NaN NaN + NaN NaN NaN + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
111141 + NaN NaN + + + NaN NaN + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
111142 + NaN NaN + + + NaN + + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
111143 + NaN NaN + + + NaN + + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
111781 - NaN NaN + - - NaN - + NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 703 columns

Clustered data overview

Here, we cluster both MEPs and votes, and highlight each MEP column with the colour of its party.

[17]:
# process data to work with clustermap
# Encode the outcomes as integers: yes ('+') = 1, abstain ('0') = 0, no ('-') = -1.
# The mapping is applied column by column and cast explicitly, instead of
# relying on fillna()/replace() silently downcasting an object frame.
# NOTE: a missing vote (NaN) -- and any symbol not listed in vote_codes -- is
# also encoded as 0, so it is drawn exactly like an abstention.
# The frame gets its own name rather than reusing `tmp`, which still holds the
# vote records read earlier in the notebook.
vote_codes = {'+': 1, '0': 0, '-': -1}
vote_values = (
    df_subset.apply(lambda mep_votes: mep_votes.map(vote_codes))
    .fillna(0)
    .astype('int64')
    .rename_axis(index='Vote', columns='MEP')
)

# infer party colors
# group of every MEP column, looked up directly rather than through a merge
mep_parties = df_meps.loc[vote_values.columns, 'group']
party_names = mep_parties.unique()
# build the palette once; asking for n_colors makes it cycle instead of
# raising IndexError should there ever be more than ten groups
party_palette = sns.color_palette('tab10', n_colors=len(party_names))
party_colors = dict(zip(party_names, party_palette))
party_cmap = mep_parties.map(party_colors).rename('Party')

# main plot
g = sns.clustermap(
    vote_values,
    col_colors=party_cmap,
    cmap=palettable.tableau.TrafficLight_9.hex_colors[3:6],
    figsize=(12, 12),
)

# plot improvements
# hide the individual vote / MEP tick labels
g.ax_heatmap.tick_params(bottom=False, labelbottom=False, right=False, labelright=False)

vote_labels = {-1: 'no', 0: 'abstain', 1: ' yes'}


@FuncFormatter
def formatter(x, pos):
    """Return the colorbar label for tick value ``x``.

    ``pos`` is the tick position passed in by matplotlib and is not used.
    Tick values other than -1, 0 and 1 get an empty label instead of raising
    a KeyError.
    """
    return vote_labels.get(x, '')


g.cax.yaxis.set_major_locator(FixedLocator([-1, 0, 1]))
g.cax.yaxis.set_major_formatter(formatter)

# party color legend
g.ax_heatmap.legend(
    handles=[
        Patch(facecolor=color, label=name) for name, color in party_colors.items()
    ],
    title='Party',
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

# TODO: improve overall colorbar/legend placement
[17]:
<matplotlib.legend.Legend at 0x155e363a0>
../_images/mining_visualization_MEP_Statistics_28_1.png

Project MEPs based on vote patterns

We will now visualize the landscape of MEPs in two dimensions based on the voting patterns using Multiple Correspondence Analysis.

Apply Multiple Correspondence Analysis (MCA)

[18]:
# Project every MEP (rows of df_subset.T) onto two MCA components.
# A fixed random_state keeps the coordinates identical between runs.
# NOTE(review): this presumes prince's default SVD engine is randomized --
# confirm against the installed prince version.
MCA_RANDOM_STATE = 42
mca = prince.MCA(n_components=2, random_state=MCA_RANDOM_STATE)
df_mca = mca.fit_transform(df_subset.T)

df_mca.columns = ('MCA0', 'MCA1')
df_mca = df_mca.rename_axis(index='mepid')
# index-aligned assignment: each MEP row receives the group stored in df_meps
df_mca['group'] = df_meps.loc[df_meps['active'], 'group']

print(df_mca.shape)
df_mca.head()
(703, 3)
[18]:
MCA0 MCA1 group
mepid
96750 -0.371809 0.631197 Verts/ALE
4746 0.593274 -0.471376 ECR
23788 0.398250 -0.753624 ECR
96810 0.604044 -0.488594 ECR
96808 -0.231881 -0.476251 PPE

Static visualization

[19]:
# Short ideology label per group id; the static MCA plot writes these labels
# at the centroid of each group.
# obtained from the information box on each party's Wikipedia entry
party_ideology = {
    'Verts/ALE': 'green',
    'ECR': 'conservative',
    'PPE': 'liberal\nconservative',  # '\n' splits the label over two lines
    'S&D': 'socialist',
    'RE': 'liberal',
    'ID': 'right-wing',
    'GUE/NGL': 'left-wing',
}
[20]:
fig, ax = plt.subplots(figsize=(8, 6))

# one point per MEP, coloured by group
sns.scatterplot(data=df_mca, x='MCA0', y='MCA1', hue='group', ax=ax)

# write the ideology label at the mean position of every group except 'NA'
party_centroids = df_mca.groupby('group').mean()
for party in party_centroids.index:
    if party != 'NA':
        ax.text(
            party_centroids.at[party, 'MCA0'],
            party_centroids.at[party, 'MCA1'],
            party_ideology.get(party),
            ha='center',
            va='center',
            fontsize=10,
            # white outline so the text stays readable on top of the points
            path_effects=[PathEffects.withStroke(linewidth=3, foreground="w")],
        )

ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1), ncol=1, title='Party')
ax.set_title('MCA of MEP Voting Patterns')
[20]:
Text(0.5, 1.0, 'MCA of MEP Voting Patterns')
../_images/mining_visualization_MEP_Statistics_34_1.png

Interactive visualization

You can zoom and pan the visualization. Hovering over a point (each point corresponds to an MEP) displays relevant information about that MEP.

[21]:
# generate data for each tooltip
hover_data = df_meps.loc[df_mca.index]

hover_data['name'] = hover_data['name'].str.title()
hover_data['birthday'] = hover_data['birthday'].apply(
    lambda x: x.strftime("%Y-%m-%d") if not pd.isnull(x) else 'undef'
)
hover_data['age'] = hover_data['age'].apply(
    lambda x: int(x) if not pd.isnull(x) else -1
)

df_data = df_mca.merge(
    hover_data.drop('group', axis=1), left_index=True, right_index=True
)
df_data.head()
[21]:
MCA0 MCA1 group name birthday active age
mepid
96750 -0.371809 0.631197 Verts/ALE François Alfonsi 1953-09-14 True 67
4746 0.593274 -0.471376 ECR Sergio Berlato 1959-07-27 True 61
23788 0.398250 -0.753624 ECR Adam Bielan 1974-09-12 True 46
96810 0.604044 -0.488594 ECR Carlo Fidanza 1976-09-21 True 44
96808 -0.231881 -0.476251 PPE Pablo Arias Echeverría 1970-06-30 True 50
[22]:
# set up point colors
palette = d3['Category10'][df_data['group'].nunique()]
color_map = bmo.CategoricalColorMapper(
    factors=df_data['group'].unique(), palette=palette
)
[23]:
# create figure
p = bpl.figure(
    tools='hover,pan,reset,wheel_zoom,box_zoom',
    active_scroll='wheel_zoom',
    tooltips=[(col, f'@{col}') for col in hover_data],
    background_fill_color='black',
)

p.scatter(
    x='MCA0',
    y='MCA1',
    color={'field': 'group', 'transform': color_map},
    legend_field='group',
    size=5,
    source=df_data,
)

p.grid.visible = False
p.axis.visible = False

bpl.show(p)