{ "cells": [ { "cell_type": "markdown", "id": "extra-france", "metadata": { "tags": [] }, "source": [ "# Benchmarking common operations" ] }, { "cell_type": "markdown", "id": "raised-ordering", "metadata": { "tags": [] }, "source": [ "## Helper class\n", "\n", "This class will do the benchmark and summarizing visualization." ] }, { "cell_type": "code", "execution_count": 1, "id": "annoying-daily", "metadata": { "tags": [] }, "outputs": [], "source": [ "import timeit\n", "import datetime\n", "import textwrap\n", "import dataclasses\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from matplotlib.ticker import FuncFormatter, NullFormatter\n", "\n", "from tabulate import tabulate\n", "from tqdm.auto import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "id": "challenging-penalty", "metadata": { "tags": [] }, "outputs": [], "source": [ "sns.set_context('talk')" ] }, { "cell_type": "code", "execution_count": 3, "id": "brief-depth", "metadata": { "tags": [] }, "outputs": [], "source": [ "@dataclasses.dataclass\n", "class BenchmarkResult:\n", " name: str\n", " execution_time: float # [s]\n", " parameter: float\n", "\n", "\n", "class Benchmaker:\n", " \"\"\"Light version of https://github.com/nschloe/perfplot.\"\"\"\n", "\n", " def __init__(\n", " self,\n", " setup,\n", " snippets,\n", " parameter_range=10 ** np.arange(2, 7),\n", " parameter_name='Array size',\n", " ):\n", " self.setup = setup\n", " self.snippets = snippets\n", "\n", " self.parameter_range = parameter_range\n", " self.parameter_name = parameter_name\n", "\n", " self.results = []\n", "\n", " def _benchmark(self, code, setup, repeat_number, exec_number):\n", " \"\"\"Return minimal execution time in seconds.\"\"\"\n", " timer = timeit.Timer(stmt=code, setup=setup)\n", " exec_number = exec_number or timer.autorange()[0]\n", "\n", " res = timer.repeat(repeat=repeat_number, number=exec_number)\n", " perf = min(res)\n", "\n", " return perf\n", "\n", " def run(self, repeat_number=5, exec_number=None):\n", " \"\"\"Run benchmark for each snippet for all parameters.\"\"\"\n", " if self.results:\n", " raise RuntimeError('This benchmark has already been executed.')\n", "\n", " for name, code in tqdm(self.snippets, desc='Snippets'):\n", " for parameter in tqdm(self.parameter_range, desc='Parameters', leave=False):\n", " code_fmt = textwrap.dedent(code.format(parameter=parameter))\n", " setup_fmt = textwrap.dedent(self.setup.format(parameter=parameter))\n", "\n", " perf = self._benchmark(\n", " code_fmt,\n", " setup_fmt,\n", " repeat_number=repeat_number,\n", " exec_number=exec_number,\n", " )\n", " self.results.append(BenchmarkResult(name, perf, parameter))\n", "\n", " def print_results(self):\n", " \"\"\"Format results as table.\"\"\"\n", " table = tabulate(\n", " [dataclasses.astuple(b) for b in self.results],\n", " headers=['Name', 'Execution Time [s]', 'Parameter'],\n", " )\n", " print(table)\n", "\n", " def visualize(self):\n", " \"\"\"Create plot summary.\"\"\"\n", " # prepare data\n", " df = pd.DataFrame([dataclasses.asdict(b) for b in self.results])\n", "\n", " # plot\n", " fig, ax = plt.subplots(figsize=(8, 6))\n", "\n", " sns.lineplot(data=df, x='parameter', y='execution_time', hue='name', marker='o')\n", "\n", " ax.set_xlabel(f'Parameter: {self.parameter_name}')\n", " ax.set_ylabel('Execution time [h:mm:ss]')\n", "\n", " ax.set_xscale('log')\n", " ax.set_yscale('log')\n", "\n", " @FuncFormatter\n", " def time_formatter(x, pos):\n", " str_ = str(datetime.timedelta(seconds=x))\n", " return str_.rstrip('0') if '.' in str_ else str_\n", "\n", " ax.yaxis.set_minor_formatter(NullFormatter())\n", " ax.yaxis.set_major_formatter(time_formatter)\n", "\n", " fig.tight_layout()" ] }, { "cell_type": "markdown", "id": "mineral-apparatus", "metadata": { "tags": [] }, "source": [ "### Simple example" ] }, { "cell_type": "code", "execution_count": 4, "id": "essential-guyana", "metadata": { "tags": [] }, "outputs": [], "source": [ "setup = 'import time'\n", "\n", "snippets = [\n", " ('short sleep', 'time.sleep({parameter})'),\n", " ('long sleep', 'time.sleep({parameter} * 2)'),\n", "]" ] }, { "cell_type": "code", "execution_count": 5, "id": "lyric-lightweight", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "32ba9adc3a5945d99def259ba5107822", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Snippets: 0%| | 0/2 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bench.visualize()" ] }, { "cell_type": "markdown", "id": "packed-fitness", "metadata": { "tags": [] }, "source": [ "## Benchmarks" ] }, { "cell_type": "markdown", "id": "jewish-coast", "metadata": { "tags": [] }, "source": [ "### Assembling a dataframe" ] }, { "cell_type": "code", "execution_count": 7, "id": "capable-ordinance", "metadata": { "tags": [] }, "outputs": [], "source": [ "setup = \"\"\"\n", " import pandas as pd\n", "\"\"\"\n", "\n", "snippets = [\n", " (\n", " 'pd.append',\n", " \"\"\"\n", " df = pd.DataFrame()\n", " for i in range({parameter}):\n", " df = df.append(pd.DataFrame({{'i': [i]}}))\n", " \"\"\",\n", " ),\n", " (\n", " 'pd.concat',\n", " \"\"\"\n", " df_list = []\n", " for i in range({parameter}):\n", " df_list.append(pd.DataFrame({{'i': [i]}}))\n", " df = pd.concat(df_list)\n", " \"\"\",\n", " ),\n", " (\n", " 'tuple-list',\n", " \"\"\"\n", " tuple_list = []\n", " for i in range({parameter}):\n", " tuple_list.append((i,))\n", " df = pd.DataFrame(tuple_list, columns=['i'])\n", " \"\"\",\n", " ),\n", " (\n", " 'dict-list',\n", " \"\"\"\n", " dict_list = []\n", " for i in range({parameter}):\n", " dict_list.append({{\n", " 'i': i\n", " }})\n", " df = pd.DataFrame(dict_list)\n", " \"\"\",\n", " ),\n", "]\n", "\n", "bench = Benchmaker(setup, snippets)" ] }, { "cell_type": "code", "execution_count": 8, "id": "previous-warner", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3cd8e2eae9f9479198e6998c00baf8f2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Snippets: 0%| | 0/4 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bench.visualize()" ] }, { "cell_type": "markdown", "id": "correct-developer", "metadata": { "tags": [] }, "source": [ "### Iterating over dataframe" ] }, { "cell_type": "code", "execution_count": 10, "id": "blessed-anatomy", "metadata": { "tags": [] }, "outputs": [], "source": [ "setup = \"\"\"\n", " import numpy as np\n", " import pandas as pd\n", " import swifter\n", " from pandarallel import pandarallel\n", " \n", " pandarallel.initialize(verbose=0)\n", "\n", " df = pd.DataFrame(np.random.normal(size=({parameter}, 10)))\n", "\n", " def work(x):\n", " return x[0] ** 2 + x[9]\n", "\"\"\"\n", "\n", "snippets = [\n", " (\n", " 'iterrows',\n", " \"\"\"\n", " for _, row in df.iterrows():\n", " work(row)\n", " \"\"\",\n", " ),\n", " (\n", " 'itertuples',\n", " \"\"\"\n", " for row in df.itertuples():\n", " work(row)\n", " \"\"\",\n", " ),\n", " (\n", " 'apply',\n", " \"\"\"\n", " df.apply(work, axis=1)\n", " \"\"\",\n", " ),\n", " (\n", " 'swifter',\n", " \"\"\"\n", " df.swifter.progress_bar(False).apply(work, axis=1)\n", " \"\"\",\n", " ),\n", " (\n", " 'pandarallel',\n", " \"\"\"\n", " df.parallel_apply(work, axis=1)\n", " \"\"\",\n", " ),\n", "]\n", "\n", "bench = Benchmaker(setup, snippets)" ] }, { "cell_type": "code", "execution_count": 11, "id": "reliable-reader", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2afe1f81f1d0469d8145a4b4f9c5cc02", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Snippets: 0%| | 0/5 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bench.visualize()" ] }, { "cell_type": "markdown", "id": "loved-minutes", "metadata": { "tags": [] }, "source": [ "### Setting values to NA" ] }, { "cell_type": "code", "execution_count": 13, "id": "defensive-pasta", "metadata": { "tags": [] }, "outputs": [], "source": [ "setup = \"\"\"\n", " import numpy as np\n", " import pandas as pd\n", "\n", " df = pd.DataFrame(np.random.randint(5, 15, size=({parameter}, {parameter})))\n", "\"\"\"\n", "\n", "snippets = [\n", " (\n", " 'indexing',\n", " \"\"\"\n", " df[df == 10] = pd.NA\n", " \"\"\",\n", " ),\n", " (\n", " 'replace',\n", " \"\"\"\n", " df.replace(10, pd.NA, inplace=True)\n", " \"\"\",\n", " ),\n", "]\n", "\n", "bench = Benchmaker(setup, snippets, parameter_range=10 ** np.arange(2, 5))" ] }, { "cell_type": "code", "execution_count": 14, "id": "incorporate-incentive", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "88054f7196b845cb8f495d3384946ae9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Snippets: 0%| | 0/2 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "bench.visualize()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }