Benchmarking common operations

Helper class

This class runs the benchmarks and visualizes a summary of the results.

[1]:
import timeit
import datetime
import textwrap
import dataclasses

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, NullFormatter

from tabulate import tabulate
from tqdm.auto import tqdm
[2]:
# Select seaborn's 'talk' plotting context for the figures in this notebook.
sns.set_context('talk')
[3]:
@dataclasses.dataclass
class BenchmarkResult:
    """Single measurement: one snippet timed at one parameter value.

    The field order matters: results are converted with `dataclasses.astuple`
    and printed under the headers 'Name', 'Execution Time [s]', 'Parameter'.
    """

    name: str  # label of the benchmarked snippet
    execution_time: float  # [s]
    parameter: float  # value substituted for `{parameter}` in snippet and setup


class Benchmaker:
    """Light version of https://github.com/nschloe/perfplot.

    Times a collection of code snippets with `timeit` for every value of a
    single parameter and summarizes the measurements as a table or a plot.

    Parameters
    ----------
    setup : str
        Setup code passed to `timeit.Timer`. `{parameter}` is substituted via
        `str.format` (literal braces must therefore be doubled) and the
        result is dedented, so it may be written as an indented block.
    snippets : iterable of (name, code) pairs
        Code to benchmark; `code` is formatted and dedented like `setup`.
    parameter_range : iterable
        Parameter values to benchmark each snippet with.
    parameter_name : str
        Human-readable parameter description used as the x-axis label.
    """

    def __init__(
        self,
        setup,
        snippets,
        parameter_range=10 ** np.arange(2, 7),
        parameter_name='Array size',
    ):
        self.setup = setup
        self.snippets = snippets

        self.parameter_range = parameter_range
        self.parameter_name = parameter_name

        # Filled by `run` with one `BenchmarkResult` per (snippet, parameter).
        self.results = []

    def _benchmark(self, code, setup, repeat_number, exec_number):
        """Return minimal execution time in seconds.

        The statement is timed `repeat_number` times, each timing running it
        `exec_number` times in a loop. The fastest timing is returned,
        normalized to a single execution so that measurements taken with
        different loop counts stay comparable.

        If `exec_number` is falsy (`None` or 0), `timeit` picks the loop count
        automatically via `Timer.autorange`.
        """
        timer = timeit.Timer(stmt=code, setup=setup)
        exec_number = exec_number or timer.autorange()[0]

        res = timer.repeat(repeat=repeat_number, number=exec_number)
        # `Timer.repeat` reports the total time of all `exec_number` loop
        # iterations; divide to obtain the time of one execution.
        perf = min(res) / exec_number

        return perf

    def run(self, repeat_number=5, exec_number=None):
        """Run benchmark for each snippet for all parameters.

        Parameters
        ----------
        repeat_number : int
            Number of timings per (snippet, parameter); the fastest is kept.
        exec_number : int or None
            Executions per timing; `None` lets `timeit` choose.

        Raises
        ------
        RuntimeError
            If results from a previous call are already stored.
        """
        if self.results:
            raise RuntimeError('This benchmark has already been executed.')

        for name, code in tqdm(self.snippets, desc='Snippets'):
            for parameter in tqdm(self.parameter_range, desc='Parameters', leave=False):
                # Substitute the parameter first, then strip the common
                # indentation of the triple-quoted source blocks.
                code_fmt = textwrap.dedent(code.format(parameter=parameter))
                setup_fmt = textwrap.dedent(self.setup.format(parameter=parameter))

                perf = self._benchmark(
                    code_fmt,
                    setup_fmt,
                    repeat_number=repeat_number,
                    exec_number=exec_number,
                )
                self.results.append(BenchmarkResult(name, perf, parameter))

    def print_results(self):
        """Format results as table and print it."""
        table = tabulate(
            [dataclasses.astuple(b) for b in self.results],
            headers=['Name', 'Execution Time [s]', 'Parameter'],
        )
        print(table)

    def visualize(self):
        """Create plot summary.

        Draws execution time against the parameter on log-log axes, with one
        line per snippet.

        Raises
        ------
        RuntimeError
            If there are no results to plot yet.
        """
        if not self.results:
            # Fail early with a clear message instead of an opaque plotting
            # error on an empty dataframe (and a stray empty figure).
            raise RuntimeError('No results to visualize; call `run` first.')

        # prepare data
        df = pd.DataFrame([dataclasses.asdict(b) for b in self.results])

        # plot
        fig, ax = plt.subplots(figsize=(8, 6))

        # Draw on the axes created above rather than on the implicit
        # "current" axes.
        sns.lineplot(
            data=df,
            x='parameter',
            y='execution_time',
            hue='name',
            marker='o',
            ax=ax,
        )

        ax.set_xlabel(f'Parameter: {self.parameter_name}')
        ax.set_ylabel('Execution time [h:mm:ss]')

        ax.set_xscale('log')
        ax.set_yscale('log')

        @FuncFormatter
        def time_formatter(x, pos):
            # `timedelta` has microsecond resolution; smaller tick values
            # would all collapse to "0:00:00", so label them in plain seconds.
            if 0 < x < 1e-6:
                return f'{x:g} s'
            str_ = str(datetime.timedelta(seconds=x))
            # Drop trailing zeros of the fractional part only.
            return str_.rstrip('0') if '.' in str_ else str_

        ax.yaxis.set_minor_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(time_formatter)

        fig.tight_layout()

Simple example

[4]:
# Setup code is handed to timeit separately from the timed statement.
setup = 'import time'

# Each entry is a (name, code) pair; `{parameter}` is filled in via
# `str.format` for every value of the parameter range.
snippets = [
    ('short sleep', 'time.sleep({parameter})'),
    ('long sleep', 'time.sleep({parameter} * 2)'),
]
[5]:
bench = Benchmaker(
    setup, snippets, parameter_range=[1, 2, 3], parameter_name='Sleep time [s]'
)

# Time every snippet with one repetition of a single execution per parameter
# value, then print the measurements as a table.
bench.run(repeat_number=1, exec_number=1)
bench.print_results()
Name           Execution Time [s]    Parameter
-----------  --------------------  -----------
short sleep               1.00081            1
short sleep               2.00446            2
short sleep               3.00067            3
long sleep                2.00468            1
long sleep                4.00008            2
long sleep                6.00353            3
[6]:
# Log-log plot of execution time against the parameter, one line per snippet.
bench.visualize()
../_images/misc_BenchmarkingCommonOperations_8_0.png

Benchmarks

Assembling a dataframe

[7]:
setup = """
    import pandas as pd
"""

# Four ways of assembling a single-column dataframe with `{parameter}` rows.
# Literal braces are doubled because every snippet goes through `str.format`.
snippets = [
    (
        # `DataFrame.append` was removed in pandas 2.0. Growing the frame with
        # `pd.concat` inside the loop is its replacement and keeps the
        # row-by-row growth pattern this entry is meant to measure.
        'pd.concat (per row)',
        """
            df = pd.DataFrame()
            for i in range({parameter}):
                df = pd.concat([df, pd.DataFrame({{'i': [i]}})])
        """,
    ),
    (
        # Collect one-row frames and concatenate them once at the end.
        'pd.concat',
        """
            df_list = []
            for i in range({parameter}):
                df_list.append(pd.DataFrame({{'i': [i]}}))
            df = pd.concat(df_list)
        """,
    ),
    (
        # Collect plain tuples and build the frame once.
        'tuple-list',
        """
            tuple_list = []
            for i in range({parameter}):
                tuple_list.append((i,))
            df = pd.DataFrame(tuple_list, columns=['i'])
        """,
    ),
    (
        # Collect dicts (one per row) and build the frame once.
        'dict-list',
        """
            dict_list = []
            for i in range({parameter}):
                dict_list.append({{
                    'i': i
                }})
            df = pd.DataFrame(dict_list)
        """,
    ),
]

bench = Benchmaker(setup, snippets)
[8]:
# One repetition of a single execution per snippet and parameter value.
bench.run(repeat_number=1, exec_number=1)
[9]:
# Log-log plot of execution time against the number of rows assembled.
bench.visualize()
../_images/misc_BenchmarkingCommonOperations_13_0.png

Iterating over dataframe

[10]:
# The setup builds a frame with `{parameter}` rows and 10 columns of normally
# distributed values and defines the per-row work shared by all snippets.
setup = """
    import numpy as np
    import pandas as pd
    import swifter
    from pandarallel import pandarallel

    pandarallel.initialize(verbose=0)

    df = pd.DataFrame(np.random.normal(size=({parameter}, 10)))

    def work(x):
        return x[0] ** 2 + x[9]
"""

# Different ways of applying `work` to every row of `df`.
snippets = [
    (
        'iterrows',
        """
            for _, row in df.iterrows():
                work(row)
        """,
    ),
    (
        # `index=False` keeps the row index out of the tuple, so that
        # `x[0]` and `x[9]` in `work` address the first and last column just
        # like in the other snippets (by default element 0 is the index).
        'itertuples',
        """
            for row in df.itertuples(index=False):
                work(row)
        """,
    ),
    (
        'apply',
        """
            df.apply(work, axis=1)
        """,
    ),
    (
        'swifter',
        """
            df.swifter.progress_bar(False).apply(work, axis=1)
        """,
    ),
    (
        'pandarallel',
        """
            df.parallel_apply(work, axis=1)
        """,
    ),
]

bench = Benchmaker(setup, snippets)
[11]:
# One repetition of a single execution per snippet and parameter value.
bench.run(repeat_number=1, exec_number=1)
[12]:
# Log-log plot of execution time against the number of rows iterated.
bench.visualize()
../_images/misc_BenchmarkingCommonOperations_17_0.png

Setting values to NA

[13]:
# The setup builds a square `{parameter}` x `{parameter}` frame of random
# integers generated with `np.random.randint(5, 15)`.
setup = """
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randint(5, 15, size=({parameter}, {parameter})))
"""

# Both snippets replace every 10 with `pd.NA` and modify `df` in place.
snippets = [
    (
        'indexing',
        """
            df[df == 10] = pd.NA
        """,
    ),
    (
        'replace',
        """
            df.replace(10, pd.NA, inplace=True)
        """,
    ),
]

# Parameter range 10**2 .. 10**4 instead of the default 10**2 .. 10**6,
# presumably because the frame holds parameter**2 cells.
bench = Benchmaker(setup, snippets, parameter_range=10 ** np.arange(2, 5))
[14]:
# The snippets modify `df` in place and timeit runs the setup once per timing,
# so a single execution per timing keeps each measurement on fresh data.
bench.run(repeat_number=1, exec_number=1)
[15]:
# Log-log plot of execution time against the side length of the frame.
bench.visualize()
../_images/misc_BenchmarkingCommonOperations_21_0.png