Investigating TV Series ratings using IMDB

[1]:

import imdb

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects

from tqdm.auto import tqdm

[2]:

# Apply seaborn's 'talk' plotting context to the figures in this notebook.
sns.set_context('talk')

Load ratings

[3]:

# Series to look up on IMDb. Each entry is either a plain title string or a
# (title, year) tuple; the tuple form additionally pins the release year when
# matching search results (see match_series).
query_list = [
    'Lost',
    'Game of Thrones',
    'Stargate SG-1',
    'Stargate: Atlantis',
    'Stargate Universe',
    'Westworld',
    'Black Mirror',
    'Breaking Bad',
    'The Witcher',
    'Dark',
    'The Expanse',
    'Buffy the Vampire Slayer',
    'Tribes of Europa',
    'Death Note',
    'Rick and Morty',
    ('Doctor Who', 2005),
]

[4]:

# IMDb access object used for the title searches and episode updates below.
ia = imdb.IMDb()

[5]:

def match_series(query, entry, kind='tv series'):
    """Decide whether an IMDb search result is the series asked for by ``query``.

    Parameters
    ----------
    query : str or (str, int)
        Either the exact title, or a ``(title, year)`` pair that also pins the
        release year.
    entry : mapping-like search result
        Must support ``.get(key)``, ``[key]`` and ``.has_key(key)``.
    kind : str, optional
        Required value of ``entry['kind']``. Defaults to ``'tv series'`` so the
        function no longer relies on a global defined in a later cell.

    Returns
    -------
    bool
        True only if title and kind match, the entry has a year, and (when a
        year was requested) that year matches too.
    """
    if isinstance(query, str):
        title, year = query, None
    else:
        title, year = query

    # .get() makes entries lacking a title/kind a non-match instead of a KeyError.
    if entry.get('title') != title:
        return False
    if entry.get('kind') != kind:
        return False

    # Exclude odd entries without a year. Keep has_key here: `in` is not a
    # reliable key-membership test on every entry type.
    if not entry.has_key('year'):
        return False

    return year is None or entry['year'] == year

[6]:

# IMDb entry kind that counts as a match (TV series only).
kind = 'tv series'

# Resolve every query on its own so that a query with no hit (or several hits)
# is reported by name; a single global length check would let one query's
# surplus hit mask another query's missing one.
series_list = []
for query in tqdm(query_list):
    title = query if isinstance(query, str) else query[0]
    hits = [x for x in ia.search_movie(title) if match_series(query, x)]
    assert len(hits) == 1, (
        f'expected exactly one match for {query!r}, got {len(hits)}: {hits}'
    )
    series_list.extend(hits)

assert len(query_list) == len(series_list)

[7]:

# Show the matched entries, one per query, for a visual sanity check.
series_list

[7]:

[<Movie id:0411008[http] title:_"Lost" (2004)_>,
 <Movie id:0944947[http] title:_"Game of Thrones" (2011)_>,
 <Movie id:0118480[http] title:_"Stargate SG-1" (1997)_>,
 <Movie id:0374455[http] title:_"Stargate: Atlantis" (2004)_>,
 <Movie id:1286039[http] title:_"Stargate Universe" (2009)_>,
 <Movie id:0475784[http] title:_"Westworld" (2016)_>,
 <Movie id:2085059[http] title:_"Black Mirror" (2011)_>,
 <Movie id:0903747[http] title:_"Breaking Bad" (2008)_>,
 <Movie id:5180504[http] title:_"The Witcher" (2019)_>,
 <Movie id:5753856[http] title:_"Dark" (2017)_>,
 <Movie id:3230854[http] title:_"The Expanse" (2015)_>,
 <Movie id:0118276[http] title:_"Buffy the Vampire Slayer" (1997)_>,
 <Movie id:9184982[http] title:_"Tribes of Europa" (2021)_>,
 <Movie id:0877057[http] title:_"Death Note" (2006)_>,
 <Movie id:2861424[http] title:_"Rick and Morty" (2013)_>,
 <Movie id:0436992[http] title:_"Doctor Who" (2005)_>]

[8]:

# Highest season number kept; more seasons would break the color palette.
MAX_SEASON_NR = 10

# Collect one record per episode, then build the frame in a single step.
records = []
for series in tqdm(series_list, desc='Series'):
    tqdm.write(series['title'])
    # Populate series['episodes'] (season number -> episode number -> episode).
    ia.update(series, 'episodes')

    for season_nr, season in series['episodes'].items():
        for episode_nr, episode in season.items():
            records.append(
                {
                    'series': series['title'],
                    'season_nr': season_nr,
                    'episode_nr': episode_nr,
                    'episode': episode['title'],
                    # Missing ratings become NaN.
                    'rating': episode.get('rating', np.nan),
                    'date': pd.to_datetime(episode.get('original air date')),
                }
            )

df = pd.DataFrame(records).sort_values(by=['series', 'season_nr', 'episode_nr'])

# "<season>:<episode>" label used as the x-axis value. Kept as plain strings:
# with a categorical column, recent seaborn versions plot all category levels
# for all series.
df['idx'] = df['season_nr'].map(str) + ':' + df['episode_nr'].map(str)

# Only keep seasons 1..MAX_SEASON_NR (inclusive).
df = df[df['season_nr'].between(1, MAX_SEASON_NR)]

Lost
Game of Thrones
Stargate SG-1
Stargate: Atlantis
Stargate Universe
Westworld
Black Mirror
Breaking Bad
The Witcher
Dark
The Expanse
Buffy the Vampire Slayer
Tribes of Europa
Death Note
Rick and Morty
Doctor Who

[9]:

# Preview the first rows of the episode table.
df.head()

[9]:

	series	season_nr	episode_nr	episode	rating	date	idx
572	Black Mirror	1	1	The National Anthem	7.701235	2011-12-04	1:1
573	Black Mirror	1	2	Fifteen Million Merits	8.101235	2011-12-11	1:2
574	Black Mirror	1	3	The Entire History of You	8.601235	2011-12-18	1:3
575	Black Mirror	2	1	Be Right Back	8.001235	2013-02-11	2:1
576	Black Mirror	2	2	White Bear	8.001235	2013-02-18	2:2

Visualize results

[10]:

def annotate_episode(entry, ax, m):
    """Label a single episode on ``ax`` with its title.

    Parameters
    ----------
    entry : row-like object
        Must expose ``episode`` (label text), ``idx`` (x value) and ``rating``
        (y value) as attributes.
    ax : matplotlib Axes
        Axes to draw the annotation on.
    m : number
        Vertical direction/scale of the label offset: the text is placed
        ``10 * m`` points from the data point (positive = above).

    Returns
    -------
    The annotation object returned by ``ax.annotate``.
    """
    # White outline keeps the text readable on top of the plotted lines.
    halo = PathEffects.withStroke(linewidth=3, foreground='w')
    label_offset = (0, 10 * m)

    annotation_style = {
        'xycoords': 'data',
        'textcoords': 'offset points',
        'fontsize': 10,
        'ha': 'center',
        'va': 'center',
        'arrowprops': {'arrowstyle': '->'},
        'path_effects': [halo],
    }

    return ax.annotate(
        entry.episode,
        xy=(entry.idx, entry.rating),
        xytext=label_offset,
        **annotation_style,
    )

[11]:

# Only the rating is needed to draw an episode. A bare dropna() would also
# discard rated episodes that merely lack another field (e.g. the air date).
plot_df = df.dropna(subset=['rating'])

# One row per series, one colored line per season.
g = sns.FacetGrid(
    plot_df,
    row='series',
    hue='season_nr',
    sharex=False,
    sharey=True,
    aspect=2,
    height=5,
)

# The frame is already ordered by (series, season, episode); sort=False keeps
# that order instead of letting lineplot re-sort the string labels, which
# could order them lexicographically ('1:10' before '1:2').
g.map_dataframe(
    sns.lineplot, x='idx', y='rating', marker='o', estimator=None, sort=False
)

g.set_xticklabels([])
g.set_ylabels('IMDB score')

g.add_legend()

# Annotate the best and worst rated episode of every season.
for (i, j, k), data in g.facet_data():
    # i, j, k = row, col, hue
    if data.empty:
        continue

    ax = g.facet_axis(i, j)
    ratings = data['rating']
    annotate_episode(data.loc[ratings.idxmax()], ax, 1)
    annotate_episode(data.loc[ratings.idxmin()], ax, -1)