diff --git a/_toc.yml b/_toc.yml index 8755167..d6fb845 100644 --- a/_toc.yml +++ b/_toc.yml @@ -22,7 +22,11 @@ subtrees: - file: ch-dask/task-graph-partitioning - file: ch-dask-dataframe/index entries: + - file: ch-dask-dataframe/dask-pandas - file: ch-dask-dataframe/read-write + - file: ch-dask-dataframe/indexing + - file: ch-dask-dataframe/map-partitions + - file: ch-dask-dataframe/shuffle - file: ch-ray-core/index entries: - file: ch-ray-core/ray-intro @@ -45,5 +49,8 @@ subtrees: - file: ch-mpi/remote-memory-access - file: ch-mpi-large-model/index entries: + - file: ch-mpi-large-model/large-model - file: ch-mpi-large-model/nccl + - file: ch-mpi-large-model/data-parallel + - file: ch-mpi-large-model/pipeline-parallel - file: ref diff --git a/ch-dask-dataframe/dask-expr.ipynb b/ch-dask-dataframe/dask-expr.ipynb new file mode 100644 index 0000000..45f7f70 --- /dev/null +++ b/ch-dask-dataframe/dask-expr.ipynb @@ -0,0 +1,471 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(dask-expr)=\n", + "# Dask Expressions\n", + "\n", + "在数据工程领域,比如 Spark 或者 SQL 数据库,有一些通用的、经典的优化方法,比如谓词下推(Predicate pushdown)。这些优化技术又被称为查询优化(Query Optimization),已经被数据工程深入研究过,它们可有效加速数据处理的速度。\n", + "\n", + "早期的 Dask DataFrame 并没有做这些优化。2023年开始,Dask DataFrame 推出了 Dask Expressions (dask-expr),专门用于查询优化,加速数据处理速度。Dask Expressions 保留了 Dask DataFrame 的 API,用户仍然使用原来的 API,只不过 Dask 在背后帮忙进行了查询优化。\n", + "\n", + "安装好后,可以直接 `import dask_expr as dd` 引入包,为了区别 `dask.dataframe` 也可以 `import dask_expr as dx`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n", + "Requirement already satisfied: dask-expr in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (1.0.4)\n", + "Requirement already satisfied: dask==2024.3.1 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask-expr) (2024.3.1)\n", + "Requirement already satisfied: pyarrow>=7.0.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask-expr) (15.0.2)\n", + "Requirement already satisfied: pandas>=2 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask-expr) (2.2.1)\n", + "Requirement already satisfied: click>=8.1 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (8.1.7)\n", + "Requirement already satisfied: cloudpickle>=1.5.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (3.0.0)\n", + "Requirement already satisfied: fsspec>=2021.09.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (2023.12.2)\n", + "Requirement already satisfied: packaging>=20.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (23.2)\n", + "Requirement already satisfied: partd>=1.2.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (1.4.1)\n", + "Requirement already satisfied: pyyaml>=5.3.1 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (6.0.1)\n", + "Requirement already satisfied: toolz>=0.10.0 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (0.12.0)\n", + "Requirement already satisfied: importlib-metadata>=4.13.0 in 
/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from dask==2024.3.1->dask-expr) (7.0.1)\n", + "Requirement already satisfied: numpy<2,>=1.23.2 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from pandas>=2->dask-expr) (1.26.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from pandas>=2->dask-expr) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from pandas>=2->dask-expr) (2023.3.post1)\n", + "Requirement already satisfied: tzdata>=2022.7 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from pandas>=2->dask-expr) (2023.4)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from importlib-metadata>=4.13.0->dask==2024.3.1->dask-expr) (3.17.0)\n", + "Requirement already satisfied: locket in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from partd>=1.2.0->dask==2024.3.1->dask-expr) (1.0.0)\n", + "Requirement already satisfied: six>=1.5 in /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=2->dask-expr) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install dask-expr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "import dask_expr as dx" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "import urllib.request\n", + "\n", + "folder_path = os.path.join(os.getcwd(), \"../data/nyc-taxi\")\n", + "download_url = [\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-05.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-07.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-08.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-09.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-11.parquet\",\n", + " \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-12.parquet\"\n", + "]\n", + "if not os.path.exists(folder_path):\n", + " os.makedirs(folder_path)\n", + "for url in download_url:\n", + " file_name = url.split(\"/\")[-1]\n", + " parquet_file_path = os.path.join(folder_path, file_name)\n", + " if not os.path.exists(os.path.join(folder_path, file_name)):\n", + " with urllib.request.urlopen(url) as response, open(parquet_file_path, 'wb') as out_file:\n", + " shutil.copyfileobj(response, out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "先看一个没有经过优化的例子:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "284 ms ± 2.06 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "ddf = dd.read_parquet(os.path.join(folder_path, \"*.parquet\"))\n", + "payment_filtered = (\n", + " ddf[ddf.payment_type == 1]['tip_amount']\n", + ")\n", + "payment_filtered_mean = payment_filtered.mean().compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "首先将所有数据读取到内存中,然后过滤 `payment_type` 为 1 的行,并只需要 `tip_amount` 列。行和列的过滤本来可以在数据读取阶段提前进行。也就是查询优化中的谓词下推:在离数据读取越近的地方,提前进行数据的过滤,避免那些不会被用到的数据被读取进来。\n", + "\n", + "如果 Dask DataFrame 用户对查询优化比较熟悉,可以将上面的代码修改为:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "146 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "ddf = dd.read_parquet(\n", + " os.path.join(folder_path, \"*.parquet\"),\n", + " filters=[(\"payment_type\", \"==\", 1)],\n", + " columns=[\"tip_amount\"],\n", + ")\n", + "payment_filtered_mean = ddf.tip_amount.mean().compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "幸运地是,Dask DataFrame 现在提供了一种自动的优化方式:Dask Expressions,它不需要用户了解查询优化。Dask Expressions " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sum: numeric_only=True\n", + " Projection: columns='tip_amount'\n", + " Filter:\n", + " ReadParquetFSSpec: path='/Users/luweizheng/Projects/godaai/distributed-python/ch-dask-dataframe/../data/nyc-taxi/*.parquet' kwargs={'dtype_backend': None}\n", + " EQ: right=1\n", + " Projection: columns='payment_type'\n", + " ReadParquetFSSpec: path='/Users/luweizheng/Projects/godaai/distributed-python/ch-dask-dataframe/../data/nyc-taxi/*.parquet' kwargs={'dtype_backend': None}\n" + ] + } + ], + "source": [ + "ddf = dx.read_parquet(os.path.join(folder_path, \"*.parquet\"))\n", + "payment_filtered = (\n", + " ddf[ddf.payment_type == 1]['tip_amount']\n", + ")\n", + "payment_filtered_mean = payment_filtered.sum(numeric_only=True)\n", + "payment_filtered_mean.pprint()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "148 ms ± 1.11 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "ddf = dx.read_parquet(\n", + " os.path.join(folder_path, \"*.parquet\"),\n", + " filters=[(\"payment_type\", \"==\", 1)],\n", + " columns=[\"tip_amount\"],\n", + ")\n", + "payment_filtered_mean = ddf.tip_amount.mean().optimize().compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'numpy.float64' object has no attribute 'compute'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m:1\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'numpy.float64' object has no attribute 'compute'" + ] + } + ], + "source": [ + "%%time\n", + "payment_filtered_mean.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'numpy.float64' object has no attribute 'optimize'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m:1\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'numpy.float64' object has no attribute 'optimize'" + ] + } + ], + "source": [ + "%%time\n", + "optimized_payment_filtered_mean = payment_filtered_mean.optimize()\n", + "optimized_payment_filtered_mean.pprint()\n", + "optimized_payment_filtered_mean.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'optimized_payment_filtered_mean' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43moptimized_payment_filtered_mean\u001b[49m\u001b[38;5;241m.\u001b[39msimplify()\u001b[38;5;241m.\u001b[39mpprint()\n", + "\u001b[0;31mNameError\u001b[0m: name 'optimized_payment_filtered_mean' is not defined" + ] + } + ], + "source": [ + "optimized_payment_filtered_mean.simplify().pprint()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "281 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "optimized_payment_filtered_mean.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Dask DataFrame Structure:\n", + " name id x y\n", + "npartitions=364 \n", + "2024-01-01 string int64 float64 float64\n", + "2024-01-02 ... ... ... ...\n", + "... ... ... ... ...\n", + "2024-12-29 ... ... ... ...\n", + "2024-12-30 ... ... ... ...\n", + "Dask Name: to_string_dtype, 2 expressions\n", + "Expr=ArrowStringConversion(frame=Timeseries(5e5bcf2))" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dx.datasets.timeseries(\n", + " start=\"2024-01-01\", \n", + " end=\"2024-12-30\", \n", + " freq=\"100ms\",\n", + ")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "'ArrowStringArray' with dtype string does not support reduction 'sum' with pyarrow version 15.0.2. 'sum' may be supported by upgrading pyarrow.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mArrowNotImplementedError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/arrays/arrow/array.py:1685\u001b[0m, in \u001b[0;36mArrowExtensionArray._reduce_pyarrow\u001b[0;34m(self, name, skipna, **kwargs)\u001b[0m\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1685\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mpyarrow_meth\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_to_reduce\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskip_nulls\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mAttributeError\u001b[39;00m, \u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pyarrow/compute.py:263\u001b[0m, in \u001b[0;36m_make_generic_wrapper..wrapper\u001b[0;34m(memory_pool, options, *args, **kwargs)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Expression\u001b[38;5;241m.\u001b[39m_call(func_name, \u001b[38;5;28mlist\u001b[39m(args), options)\n\u001b[0;32m--> 263\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmemory_pool\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pyarrow/_compute.pyx:385\u001b[0m, in \u001b[0;36mpyarrow._compute.Function.call\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pyarrow/error.pxi:154\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pyarrow/error.pxi:91\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mArrowNotImplementedError\u001b[0m: Function 'sum' has no kernel matching input types (large_string)", + "\nThe above exception was the 
direct cause of the following exception:\n", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mid\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1000\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_collection.py:1328\u001b[0m, in \u001b[0;36mFrameBase.sum\u001b[0;34m(self, axis, skipna, numeric_only, min_count, split_every, **kwargs)\u001b[0m\n\u001b[1;32m 1319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m axis \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmap_partitions(\n\u001b[1;32m 1321\u001b[0m M\u001b[38;5;241m.\u001b[39msum,\n\u001b[1;32m 1322\u001b[0m skipna\u001b[38;5;241m=\u001b[39mskipna,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1325\u001b[0m min_count\u001b[38;5;241m=\u001b[39mmin_count,\n\u001b[1;32m 1326\u001b[0m )\n\u001b[0;32m-> 1328\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mnew_collection\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit_every\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1329\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_apply_min_count(result, min_count)\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_collection.py:4578\u001b[0m, in \u001b[0;36mnew_collection\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 4576\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnew_collection\u001b[39m(expr):\n\u001b[1;32m 4577\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create new collection from an expr\"\"\"\u001b[39;00m\n\u001b[0;32m-> 4578\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mexpr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\n\u001b[1;32m 4579\u001b[0m expr\u001b[38;5;241m.\u001b[39m_name \u001b[38;5;66;03m# Ensure backend is imported\u001b[39;00m\n\u001b[1;32m 4580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m get_collection_type(meta)(expr)\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/functools.py:1001\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 999\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 1000\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m-> 1001\u001b[0m val \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1003\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_reductions.py:429\u001b[0m, in \u001b[0;36mApplyConcatApply._meta\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mcached_property\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_meta\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 429\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta_chunk\u001b[49m\n\u001b[1;32m 430\u001b[0m aggregate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregate \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;28;01mlambda\u001b[39;00m x: x)\n\u001b[1;32m 431\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcombine:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/functools.py:1001\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 999\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 1000\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m-> 1001\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1003\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_reductions.py:425\u001b[0m, in \u001b[0;36mApplyConcatApply._meta_chunk\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 422\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mcached_property\n\u001b[1;32m 423\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_meta_chunk\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 424\u001b[0m meta \u001b[38;5;241m=\u001b[39m meta_nonempty(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframe\u001b[38;5;241m.\u001b[39m_meta)\n\u001b[0;32m--> 425\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmeta\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_reductions.py:740\u001b[0m, in \u001b[0;36mReduction.chunk\u001b[0;34m(cls, df, **kwargs)\u001b[0m\n\u001b[1;32m 738\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 739\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mchunk\u001b[39m(\u001b[38;5;28mcls\u001b[39m, df, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 740\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduction_chunk\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;66;03m# Return a dataframe so that the concatenated version is also a dataframe\u001b[39;00m\n\u001b[1;32m 742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m out\u001b[38;5;241m.\u001b[39mto_frame()\u001b[38;5;241m.\u001b[39mT \u001b[38;5;28;01mif\u001b[39;00m is_series_like(out) \u001b[38;5;28;01melse\u001b[39;00m out\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/utils.py:1241\u001b[0m, in \u001b[0;36mmethodcaller.__call__\u001b[0;34m(self, _methodcaller__obj, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, __obj, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m-> 1241\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m__obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/frame.py:11657\u001b[0m, in \u001b[0;36mDataFrame.sum\u001b[0;34m(self, axis, skipna, numeric_only, min_count, **kwargs)\u001b[0m\n\u001b[1;32m 11648\u001b[0m \u001b[38;5;129m@doc\u001b[39m(make_doc(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m))\n\u001b[1;32m 11649\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msum\u001b[39m(\n\u001b[1;32m 11650\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11655\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 11656\u001b[0m ):\n\u001b[0;32m> 11657\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msum\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11658\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/generic.py:12503\u001b[0m, in \u001b[0;36mNDFrame.sum\u001b[0;34m(self, axis, skipna, numeric_only, min_count, **kwargs)\u001b[0m\n\u001b[1;32m 12495\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msum\u001b[39m(\n\u001b[1;32m 12496\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 12497\u001b[0m axis: Axis \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12501\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 12502\u001b[0m ):\n\u001b[0;32m> 12503\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_min_count_stat_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12504\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msum\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnanops\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnansum\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 12505\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/generic.py:12486\u001b[0m, in \u001b[0;36mNDFrame._min_count_stat_function\u001b[0;34m(self, name, func, axis, skipna, numeric_only, min_count, **kwargs)\u001b[0m\n\u001b[1;32m 12483\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m axis \u001b[38;5;129;01mis\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mno_default:\n\u001b[1;32m 12484\u001b[0m axis \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[0;32m> 12486\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12487\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12488\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12489\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12490\u001b[0m \u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12491\u001b[0m \u001b[43m \u001b[49m\u001b[43mnumeric_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnumeric_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12492\u001b[0m \u001b[43m \u001b[49m\u001b[43mmin_count\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmin_count\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12493\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/frame.py:11549\u001b[0m, in \u001b[0;36mDataFrame._reduce\u001b[0;34m(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)\u001b[0m\n\u001b[1;32m 11545\u001b[0m df 
\u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mT\n\u001b[1;32m 11547\u001b[0m \u001b[38;5;66;03m# After possibly _get_data and transposing, we are now in the\u001b[39;00m\n\u001b[1;32m 11548\u001b[0m \u001b[38;5;66;03m# simple case where we can use BlockManager.reduce\u001b[39;00m\n\u001b[0;32m> 11549\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduce\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblk_func\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11550\u001b[0m out \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(res, axes\u001b[38;5;241m=\u001b[39mres\u001b[38;5;241m.\u001b[39maxes)\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 11551\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out_dtype \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m out\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboolean\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/internals/managers.py:1500\u001b[0m, in \u001b[0;36mBlockManager.reduce\u001b[0;34m(self, func)\u001b[0m\n\u001b[1;32m 1498\u001b[0m res_blocks: \u001b[38;5;28mlist\u001b[39m[Block] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m blk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocks:\n\u001b[0;32m-> 1500\u001b[0m nbs \u001b[38;5;241m=\u001b[39m \u001b[43mblk\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduce\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1501\u001b[0m res_blocks\u001b[38;5;241m.\u001b[39mextend(nbs)\n\u001b[1;32m 1503\u001b[0m index \u001b[38;5;241m=\u001b[39m Index([\u001b[38;5;28;01mNone\u001b[39;00m]) \u001b[38;5;66;03m# placeholder\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/internals/blocks.py:404\u001b[0m, in \u001b[0;36mBlock.reduce\u001b[0;34m(self, func)\u001b[0m\n\u001b[1;32m 398\u001b[0m \u001b[38;5;129m@final\u001b[39m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mreduce\u001b[39m(\u001b[38;5;28mself\u001b[39m, func) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Block]:\n\u001b[1;32m 400\u001b[0m \u001b[38;5;66;03m# We will apply the function and reshape the result into a single-row\u001b[39;00m\n\u001b[1;32m 401\u001b[0m \u001b[38;5;66;03m# Block with the same mgr_locs; squeezing will be done at a higher level\u001b[39;00m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m\n\u001b[0;32m--> 404\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 406\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 407\u001b[0m res_values \u001b[38;5;241m=\u001b[39m result\n", + "File 
\u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/frame.py:11457\u001b[0m, in \u001b[0;36mDataFrame._reduce..blk_func\u001b[0;34m(values, axis)\u001b[0m\n\u001b[1;32m 11455\u001b[0m dtype_has_keepdims[values\u001b[38;5;241m.\u001b[39mdtype] \u001b[38;5;241m=\u001b[39m has_keepdims\n\u001b[1;32m 11456\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_keepdims:\n\u001b[0;32m> 11457\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvalues\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeepdims\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11458\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 11459\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 11460\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(values)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m._reduce will require a `keepdims` parameter \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 11461\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124min the future\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 11462\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 11463\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 11464\u001b[0m )\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/arrays/string_arrow.py:567\u001b[0m, in \u001b[0;36mArrowStringArray._reduce\u001b[0;34m(self, name, skipna, keepdims, **kwargs)\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_reduce\u001b[39m(\n\u001b[1;32m 565\u001b[0m \u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;241m*\u001b[39m, skipna: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m, keepdims: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 566\u001b[0m ):\n\u001b[0;32m--> 567\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce_calc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeepdims\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeepdims\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 568\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124margmin\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124margmax\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(result, pa\u001b[38;5;241m.\u001b[39mArray):\n\u001b[1;32m 569\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_convert_int_dtype(result)\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/arrays/arrow/array.py:1761\u001b[0m, in \u001b[0;36mArrowExtensionArray._reduce_calc\u001b[0;34m(self, name, skipna, keepdims, **kwargs)\u001b[0m\n\u001b[1;32m 1758\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_reduce_calc\u001b[39m(\n\u001b[1;32m 1759\u001b[0m \u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;241m*\u001b[39m, skipna: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m, keepdims: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m 1760\u001b[0m ):\n\u001b[0;32m-> 1761\u001b[0m pa_result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reduce_pyarrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskipna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskipna\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1763\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m keepdims:\n\u001b[1;32m 1764\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(pa_result, pa\u001b[38;5;241m.\u001b[39mScalar):\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/arrays/arrow/array.py:1693\u001b[0m, in \u001b[0;36mArrowExtensionArray._reduce_pyarrow\u001b[0;34m(self, name, skipna, **kwargs)\u001b[0m\n\u001b[1;32m 1686\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mAttributeError\u001b[39;00m, \u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 1687\u001b[0m msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1688\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m with dtype \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1689\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdoes not support reduction \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m with pyarrow \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1690\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mversion \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpa\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m may be supported by \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1691\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mupgrading pyarrow.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1692\u001b[0m )\n\u001b[0;32m-> 1693\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 1694\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmedian\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;66;03m# GH 52679: Use quantile instead of approximate_median; returns array\u001b[39;00m\n\u001b[1;32m 1696\u001b[0m result \u001b[38;5;241m=\u001b[39m result[\u001b[38;5;241m0\u001b[39m]\n", + "\u001b[0;31mTypeError\u001b[0m: 'ArrowStringArray' with dtype string does not support reduction 'sum' with pyarrow version 15.0.2. 'sum' may be supported by upgrading pyarrow." + ] + } + ], + "source": [ + "out = df[df.id == 1000].sum()[\"x\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dispy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ch-dask-dataframe/dask-pandas.md b/ch-dask-dataframe/dask-pandas.md new file mode 100644 index 0000000..e83c41e --- /dev/null +++ b/ch-dask-dataframe/dask-pandas.md @@ -0,0 +1,3 @@ +# Dask DataFrame 与 pandas + +pandas 已经成为 DataFrame 的标准,但无法利用多核和集群,Dask DataFrame 试图解决 pandas 并行计算的问题。Dask DataFrame 尽量提供与 pandas 一致的 API,但使用起来,Dask DataFrame 仍有很多不同。Dask DataFrame 将大数据切分成小的 Partition,每个 Partition 是一个 pandas DataFrame。Dask 会把 DataFrame 的元数据记录下来,存储在 `_meta` 中。多个 Partition 的信息存储在 `partitions` 里和 `divisions` 里。Dask 用 Task Graph 管理计算图。对于用户来说,其实不需要太深入理解 Dask DataFrame 具体如何实现的,只需要调用类 pandas 的上层 API。本章假设用户已经了解并熟悉 pandas,并重点讨论 Dask DataFrame 与 pandas 的区别。 \ No newline at end of file diff --git a/ch-dask-dataframe/index.md b/ch-dask-dataframe/index.md index 11ead8b..5a59379 100644 --- a/ch-dask-dataframe/index.md +++ b/ch-dask-dataframe/index.md @@ -1,6 +1,4 @@ # Dask DataFrame -pandas 已经成为 DataFrame 的标准,但无法利用多核和集群,Dask DataFrame 试图解决 pandas 并行计算的问题。Dask DataFrame 尽量提供与 pandas 一致的 API,但使用起来,Dask DataFrame 仍有很多不同。本章假设用户已经了解并熟悉 pandas,并重点讨论 Dask DataFrame 与 pandas 的区别。 - ```{tableofcontents} ``` \ No newline at end of file diff --git a/ch-dask-dataframe/indexing.ipynb b/ch-dask-dataframe/indexing.ipynb new file mode 100644 index 0000000..e90f3e6 --- /dev/null +++ b/ch-dask-dataframe/indexing.ipynb @@ -0,0 +1,1043 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(dask-dataframe-indexing)=\n", + "# 索引" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "%config InlineBackend.figure_format = 'svg'\n", + "import os\n", + "import urllib\n", + "import shutil\n", + "from zipfile import ZipFile\n", + "\n", + "import dask\n", + 
"import dask.dataframe as dd\n", + "import pandas as pd\n", + "from dask.distributed import LocalCluster, Client\n", + "\n", + "cluster = LocalCluster()\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "如 {numref}`pandas-dataframe-model` 所示,pandas DataFrame 主要对二维的表进行处理,有列标签和行标签。行标签通常会被用户忽视,但实际上起着至关重要的作用,比如索引(Indexing)。大多数 pandas DataFrame 的行标签是排好序的索引,比如从 0 开始递增。 DataFrame 里面的数据也是有序的。\n", + "\n", + "```{figure} ../img/ch-dask-dataframe/dataframe-model.svg\n", + "---\n", + "width: 200px\n", + "name: pandas-dataframe-model\n", + "---\n", + "pandas DataFrame 数据模型\n", + "```\n", + "\n", + "创建 pandas DataFrame 时,会在最左侧自动生成了索引列,它不是 DataFrame 的“官方”字段,因为索引列并没有列名。" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0fooone110
1barone220
2baztwo330
3quxthree440
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 foo one 1 10\n", + "1 bar one 2 20\n", + "2 baz two 3 30\n", + "3 qux three 4 40" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\n", + " 'A': ['foo', 'bar', 'baz', 'qux'],\n", + " 'B': ['one', 'one', 'two', 'three'],\n", + " 'C': [1, 2, 3, 4],\n", + " 'D': [10, 20, 30, 40]\n", + "})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "也可以设置一个字段作为索引列:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BCD
A
fooone110
barone220
baztwo330
quxthree440
\n", + "
" + ], + "text/plain": [ + " B C D\n", + "A \n", + "foo one 1 10\n", + "bar one 2 20\n", + "baz two 3 30\n", + "qux three 4 40" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.set_index('A')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "或者重置回原来的结构:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
0fooone110
1barone220
2baztwo330
3quxthree440
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 foo one 1 10\n", + "1 bar one 2 20\n", + "2 baz two 3 30\n", + "3 qux three 4 40" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.reset_index()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 有序行索引\n", + "\n", + "Dask DataFrame 由多个 pandas DataFrame 组成,但如何在全局维度维护整个 Dask DataFrame 行标签和行顺序是一个很大的挑战。Dask DataFrame 并没有刻意保留全局有序性,也使得它无法支持所有 pandas DataFrame 的功能。\n", + "\n", + "如 {numref}`dask-dataframe-divisions` 所示,Dask DataFrame 在切分时有 `divisions`。 \n", + "\n", + "```{figure} ../img/ch-dask-dataframe/divisions.png\n", + "---\n", + "width: 400px\n", + "name: dask-dataframe-divisions\n", + "---\n", + "Dask DataFrame 的 `divisions`\n", + "```\n", + "\n", + "以 Dask 提供的样例数据函数 `dask.datasets.timeseries` 为例,它生成了时间序列,使用时间戳作为行标签,每个 Partition 的边界都被记录下来,存储在 `.divisions` 里。`len(divisons)` 等于 `npartitions + 1`。" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "df.npartitions: 1826\n", + "df.divisions: 1827\n" + ] + } + ], + "source": [ + "ts_df = dask.datasets.timeseries(\"2018-01-01\", \"2023-01-01\")\n", + "print(f\"df.npartitions: {ts_df.npartitions}\")\n", + "print(f\"df.divisions: {len(ts_df.divisions)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dask DataFrame 没有记录每个 Partition 中有多少行,因此无法在全局角度支持基于行索引的操作,比如 `iloc`。" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotImplementedError, 'DataFrame.iloc' only supports selecting columns. It must be used like 'df.iloc[:, column_indexer]'.\n" + ] + } + ], + "source": [ + "try:\n", + " ts_df.iloc[3].compute()\n", + "except Exception as e:\n", + " print(f\"{type(e).__name__}, {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "但是可以支持列标签,或者 `:` 这样的通配符:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idx
timestamp
2018-01-01 00:00:009840.660595
2018-01-01 00:00:01960-0.747564
2018-01-01 00:00:0210390.777117
2018-01-01 00:00:031038-0.501949
2018-01-01 00:00:049920.767979
.........
2022-12-31 23:59:551005-0.102774
2022-12-31 23:59:561040-0.648857
2022-12-31 23:59:571019-0.310174
2022-12-31 23:59:589870.889037
2022-12-31 23:59:59977-0.078216
\n", + "

157766400 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id x\n", + "timestamp \n", + "2018-01-01 00:00:00 984 0.660595\n", + "2018-01-01 00:00:01 960 -0.747564\n", + "2018-01-01 00:00:02 1039 0.777117\n", + "2018-01-01 00:00:03 1038 -0.501949\n", + "2018-01-01 00:00:04 992 0.767979\n", + "... ... ...\n", + "2022-12-31 23:59:55 1005 -0.102774\n", + "2022-12-31 23:59:56 1040 -0.648857\n", + "2022-12-31 23:59:57 1019 -0.310174\n", + "2022-12-31 23:59:58 987 0.889037\n", + "2022-12-31 23:59:59 977 -0.078216\n", + "\n", + "[157766400 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts_df.iloc[:, [1, 2]].compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "对于 CSV 文件,Dask DataFrame 并没有自动生成 `divisions`。" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(None, None, None, None, None, None, None)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", + "download_url = \"https://dp.godaai.org/nyc-flights.zip\"\n", + "zip_file_path = os.path.join(folder_path, \"nyc-flights.zip\")\n", + "if not os.path.exists(os.path.join(folder_path, \"nyc-flights\")):\n", + " with urllib.request.urlopen(download_url) as response, open(zip_file_path, 'wb') as out_file:\n", + " shutil.copyfileobj(response, out_file)\n", + " zf = ZipFile(zip_file_path, 'r')\n", + " zf.extractall(folder_path)\n", + " zf.close()\n", + "file_path = os.path.join(folder_path, \"nyc-flights\", \"*.csv\")\n", + "flights_ddf = dd.read_csv(file_path,\n", + " parse_dates={'Date': [0, 1, 2]},\n", + " dtype={'TailNum': object,\n", + " 'CRSElapsedTime': float,\n", + " 'Cancelled': bool})\n", + "flights_ddf.divisions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "因为没有记录每个 Partition 有多少条数据,Dask DataFrame 无法很好地支持一些操作,比如 `median()` 这样的百分位操作,因为这些操作需要:(1) 对数据排序;(2) 定位到特定的行。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NotImplementedError, Dask doesn't implement an exact median in all cases as this is hard to do in parallel. 
See the `median_approximate` method instead, which uses an approximate algorithm.\n" + ] + } + ], + "source": [ + "try:\n", + " flights_ddf['DepDelay'].median()\n", + "except Exception as e:\n", + " print(f\"{type(e).__name__}, {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 设置索引列\n", + "\n", + "### `set_index()`\n", + "\n", + "在 Dask DataFrame 中,我们可以使用 `set_index()` 方法手动设置某一列为索引列,这个操作除了把某个字段设置为索引列,还会根据这个字段对全局数据进行排序,它打乱了原来每个 Partition 内的数据排序,因此会有很高的成本。\n", + "\n", + "下面的例子展示了 `set_index()` 带来的变化:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " col1 col2\n", + "0 01 a\n", + "1 05 b\n", + "2 02 c\n", + " col1 col2\n", + "3 03 d\n", + "4 04 e\n" + ] + } + ], + "source": [ + "def print_partitions(ddf):\n", + " for i in range(ddf.npartitions):\n", + " print(ddf.partitions[i].compute())\n", + "\n", + "df = pd.DataFrame(\n", + " {\"col1\": [\"01\", \"05\", \"02\", \"03\", \"04\"], \"col2\": [\"a\", \"b\", \"c\", \"d\", \"e\"]}\n", + ")\n", + "ddf = dd.from_pandas(df, npartitions=2)\n", + "print_partitions(ddf)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " col2\n", + "col1 \n", + "01 a\n", + " col2\n", + "col1 \n", + "02 c\n", + "03 d\n", + "04 e\n", + "05 b\n" + ] + } + ], + "source": [ + "ddf2 = ddf.set_index(\"col1\")\n", + "print_partitions(ddf2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这个例子设置 `col1` 列为索引列,2 个 Partition 中的数据被打乱重排。如果是在数据量很大的场景,全局数据排序和重分布的成本极高,因此应该尽量避免这个操作。`set_index()` 也有它的优势:它可以加速下游的计算。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "回到时间序列数据,该数据使用时间戳作为索引列。下面使用两种方式对这份数据进行 `set_index()`:第一种没有设置 `divisions`,第二种设置了 `divisions`。\n", + "\n", + "第一种不设置 `divisions` 耗时很长,因为 Dask DataFrame 计算了所有 Partition 的数据分布,并根据分布重排列了所有的 Partition,可以看到,Partition 的数目也发生了变化。" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before set_index npartitions: 1826\n", + "after set_index npartitions: 163\n", + "CPU times: user 6.1 s, sys: 3.47 s, total: 9.57 s\n", + "Wall time: 19.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "ts_df1 = ts_df.set_index(\"id\")\n", + "nu = ts_df1.loc[[1001]].name.nunique().compute()\n", + "print(f\"before set_index npartitions: {ts_df.npartitions}\")\n", + "print(f\"after set_index npartitions: {ts_df1.npartitions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "第二种方式先提前获取了 `divisions`,然后将这些 `divisions` 用于设置 `set_index()`。设定了 `divisions` 的 `set_index()` 速度更快。" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "dask_computed_divisions = ts_df.set_index(\"id\").divisions\n", + "unique_divisions = list(dict.fromkeys(list(dask_computed_divisions)))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.25 s, sys: 1.09 s, total: 4.34 s\n", + "Wall time: 11.7 s\n" + ] + } + ], + "source": [ + "%%time\n", + "ts_df2 = ts_df.set_index(\"id\", divisions=unique_divisions)\n", + "nuids = ts_df2.loc[[1001]].name.nunique().compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "如果不设置索引列,直接对 `id` 列进行查询,会发现反而更快。" + ] + },
+ { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.94 s, sys: 743 ms, total: 2.68 s\n", + "Wall time: 8.18 s\n" + ] + } + ], + "source": [ + "%%time\n", + "nu = ts_df.loc[ts_df[\"id\"] == 1001].name.nunique().compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "所以 Dask DataFrame 要慎重使用 `set_index()`,如果 `set_index()` 之后有很多以下操作,可以考虑使用 `set_index()`。\n", + "\n", + "* 使用 `loc` 对索引列进行过滤\n", + "* 两个 Dask DataFrame 在索引列上合并(`merge()`)\n", + "* 在索引列上进行分组聚合(`groupby()`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `reset_index()`\n", + "\n", + "在 pandas 中,默认 `as_index=True` 时,分组字段经过 `groupby()` 之后成为索引列。索引列在 DataFrame 中并不是正式的数据列,如果分组聚合之后只有一个字段(不考虑分组字段),分组聚合的结果就成了一个 `Series`。比如下面 pandas 的例子,`Origin` 列就是分组字段,如果不设置 `as_index=False`,`groupby(\"Origin\", as_index=False)[\"DepDelay\"].mean()` 生成的是一个 `Series`。" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OriginAvgDepDelay
2LGA5.726304
0EWR6.916220
1JFK9.311532
\n", + "
" + ], + "text/plain": [ + " Origin AvgDepDelay\n", + "2 LGA 5.726304\n", + "0 EWR 6.916220\n", + "1 JFK 9.311532" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# pandas\n", + "file_path = os.path.join(folder_path, \"nyc-flights\", \"1991.csv\")\n", + "pdf = pd.read_csv(file_path,\n", + " parse_dates={'Date': [0, 1, 2]},\n", + " dtype={'TailNum': object,\n", + " 'CRSElapsedTime': float,\n", + " 'Cancelled': bool})\n", + "uncancelled_pdf = pdf[pdf[\"Cancelled\"] == False]\n", + "avg_pdf = uncancelled_pdf.groupby(\"Origin\", as_index=False)[\"DepDelay\"].mean()\n", + "avg_pdf.columns = [\"Origin\", \"AvgDepDelay\"]\n", + "avg_pdf.sort_values(\"AvgDepDelay\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "或者是 `reset_index()`,来取消索引列,分组字段会成为 `DataFrame` 的一个正式的字段。" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OriginAvgDepDelay
2LGA5.726304
0EWR6.916220
1JFK9.311532
\n", + "
" + ], + "text/plain": [ + " Origin AvgDepDelay\n", + "2 LGA 5.726304\n", + "0 EWR 6.916220\n", + "1 JFK 9.311532" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_pdf = uncancelled_pdf.groupby(\"Origin\")[\"DepDelay\"].mean().reset_index()\n", + "avg_pdf.columns = [\"Origin\", \"AvgDepDelay\"]\n", + "avg_pdf.sort_values(\"AvgDepDelay\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dask DataFrame 的 `groupby()` 不支持 `as_index` 参数。Dask DataFrame 只能使用 `reset_index()` 来取消索引列。" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
OriginAvgDepDelay
2LGA6.944939
0EWR9.997188
1JFK10.766914
\n", + "
" + ], + "text/plain": [ + " Origin AvgDepDelay\n", + "2 LGA 6.944939\n", + "0 EWR 9.997188\n", + "1 JFK 10.766914" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uncancelled_ddf = flights_ddf[flights_ddf[\"Cancelled\"] == False]\n", + "avg_ddf = uncancelled_ddf.groupby(\"Origin\")[\"DepDelay\"].mean().reset_index()\n", + "avg_ddf.columns = [\"Origin\", \"AvgDepDelay\"]\n", + "avg_ddf = avg_ddf.compute()\n", + "# pandas 只使用了一年数据,因此结果不一样\n", + "avg_ddf.sort_values(\"AvgDepDelay\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dispy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ch-dask-dataframe/map-partitions.ipynb b/ch-dask-dataframe/map-partitions.ipynb new file mode 100644 index 0000000..4077e66 --- /dev/null +++ b/ch-dask-dataframe/map-partitions.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `map_partitions`\n", + "\n", + "除了 {numref}`dask-dataframe-shuffle` 中提到的一些需要通信的计算外,有一种最简单的并行方式,英文术语为 Embarrassingly Parallel,中文可翻译为易并行。它指的是该类计算不需要太多跨 Worker 的协调和通信。比如,对某个字段加一,每个 Worker 内执行加法操作即可,Worker 之间没有通信的开销。Dask DataFrame 中可以使用 `map_partitions()` 来做这类 Embarrassingly Parallel 的操作。`map_partitions(func)` 的参数是一个 `func`,这个 `func` 将在每个 Partition 上执行。\n", + "\n", + "下面的案例对缺失值进行填充,它没有跨 Worker 的通信开销,因此是一种 Embarrassingly Parallel 的典型应用场景。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib\n", + "import shutil\n", + "from zipfile import ZipFile\n", + "import warnings\n", + "\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", + "download_url_prefix = \"https://gender-pay-gap.service.gov.uk/viewing/download-data/\"\n", + "file_path_prefix = os.path.join(folder_path, \"gender-pay\")\n", + "if not os.path.exists(file_path_prefix):\n", + " os.makedirs(file_path_prefix)\n", + "for year in [2017, 2018, 2019, 2020, 2021, 2022]:\n", + " download_url = download_url_prefix + str(year)\n", + " file_path = os.path.join(file_path_prefix, f\"{str(year)}.csv\")\n", + " if not os.path.exists(file_path):\n", + " with urllib.request.urlopen(download_url) as response, open(file_path, 'wb') as out_file:\n", + " shutil.copyfileobj(response, out_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 57481 instead\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "import pandas as pd\n", + "from dask.distributed import LocalCluster, Client\n", + "\n", + "cluster = LocalCluster()\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, 
+ "outputs": [], + "source": [ + "ddf = dd.read_csv(os.path.join(file_path_prefix, \"*.csv\"),\n", + " dtype={'CompanyNumber': 'str', 'DiffMeanHourlyPercent': 'float64'})\n", + "\n", + "def fillna(df):\n", + " return df.fillna(value={\"PostCode\": \"UNKNOWN\"})\n", + " \n", + "ddf = ddf.map_partitions(fillna)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dask DataFrame 模拟了 pandas DataFrame,如果这个 API 的计算模式是 Embarrassingly Parallel,它的底层很可能就是使用 `map_partitions()` 实现的。\n", + "\n", + "{numref}`dask-dataframe-indexing` 提到过,Dask DataFrame 会在某个列上进行切分。我们可以在 `map_partitions()` 的 `func` 中实现任何我们想做的事情,但如果对这些切分的列做了改动,需要 `clear_divisions()` 或者重新 `set_index()`。" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Dask DataFrame Structure:
 EmployerName  EmployerId  Address  PostCode  CompanyNumber  SicCodes  DiffMeanHourlyPercent  DiffMedianHourlyPercent  DiffMeanBonusPercent  DiffMedianBonusPercent  MaleBonusPercent  FemaleBonusPercent  MaleLowerQuartile  FemaleLowerQuartile  MaleLowerMiddleQuartile  FemaleLowerMiddleQuartile  MaleUpperMiddleQuartile  FemaleUpperMiddleQuartile  MaleTopQuartile  FemaleTopQuartile  CompanyLinkToGPGInfo  ResponsiblePerson  EmployerSize  CurrentName  SubmittedAfterTheDeadline  DueDate  DateSubmitted
 npartitions=6
 string  int64  string  string  string  string  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  float64  string  string  string  string  bool  string  string
Dask Name: fillna, 3 graph layers
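`map_partitions()` 中的 `func` 以 pandas DataFrame 为输入和输出,可以在其中增加或修改列。下面是一个简单示意(假设 `ddf` 是上文读入的 gender-pay 数据,函数名 `add_abs_gap` 和新列名 `AbsDiffMeanHourlyPercent` 都是为演示而造的);当 Dask 推断不出输出的列类型时,可以用 `meta` 参数显式给出:

```python
# 仅为示意:在每个 Partition(一个 pandas DataFrame)上新增一列
def add_abs_gap(df):
    df = df.copy()
    df["AbsDiffMeanHourlyPercent"] = df["DiffMeanHourlyPercent"].abs()
    return df

ddf2 = ddf.map_partitions(add_abs_gap)

# 如果自动推断失败,可以显式传入 meta(一个 0 行、带类型的 pandas DataFrame)
meta = ddf._meta.assign(AbsDiffMeanHourlyPercent=0.0)
ddf2 = ddf.map_partitions(add_abs_gap, meta=meta)
```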
" + ], + "text/plain": [ + "Dask DataFrame Structure:\n", + " EmployerName EmployerId Address PostCode CompanyNumber SicCodes DiffMeanHourlyPercent DiffMedianHourlyPercent DiffMeanBonusPercent DiffMedianBonusPercent MaleBonusPercent FemaleBonusPercent MaleLowerQuartile FemaleLowerQuartile MaleLowerMiddleQuartile FemaleLowerMiddleQuartile MaleUpperMiddleQuartile FemaleUpperMiddleQuartile MaleTopQuartile FemaleTopQuartile CompanyLinkToGPGInfo ResponsiblePerson EmployerSize CurrentName SubmittedAfterTheDeadline DueDate DateSubmitted\n", + "npartitions=6 \n", + " string int64 string string string string float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 string string string string bool string string\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "Dask Name: fillna, 3 graph layers" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf.clear_divisions()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dispy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ch-dask-dataframe/read-write.ipynb b/ch-dask-dataframe/read-write.ipynb index c11d3a4..964ff11 100644 --- a/ch-dask-dataframe/read-write.ipynb +++ b/ch-dask-dataframe/read-write.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "(dask-read-write)=\n", + "(dask-dataframe-read-write)=\n", "# 读写数据\n", "\n", "Dask DataFrame 支持 pandas 中几乎所有的数据读写操作,包括从本地、NFS、HDFS 或 S3 上读写文本文件、Parquet、HDF、JSON 等格式的文件。 {numref}`dask-read-write-operations` 是几个常见的读写操作。\n", @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -62,13 +62,9 @@ "import urllib\n", "import shutil\n", "from zipfile import ZipFile\n", + "import warnings\n", "\n", - "import dask.dataframe as dd\n", - "import pandas as pd\n", - "from dask.distributed import LocalCluster, Client\n", - "\n", - "cluster = LocalCluster()\n", - "client = Client(cluster)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", "download_url = \"https://dp.godaai.org/nyc-flights.zip\"\n", @@ -84,6 +80,31 @@ "print(file_path)" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already 
have a cluster running?\n", + "Hosting the HTTP server on port 55077 instead\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "import pandas as pd\n", + "from dask.distributed import LocalCluster, Client\n", + "\n", + "cluster = LocalCluster()\n", + "client = Client(cluster)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -93,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -132,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -281,7 +302,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -292,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -436,7 +457,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -456,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -465,255 +486,255 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", - "-4992946418407614279\n", + "-7150382809904974857\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "-2791454735931807732\n", + "-6194689680917011400\n", "\n", "0\n", "\n", - "\n", + "\n", "\n", - "-4992946418407614279->-2791454735931807732\n", + "-7150382809904974857->-6194689680917011400\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-1856934773411932769\n", + "6071941830101125281\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-2791454735931807732->-1856934773411932769\n", + "-6194689680917011400->6071941830101125281\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-66498125748364619\n", + "-65660599333282282\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "3716186020170190693\n", + "312951075184987025\n", "\n", "1\n", "\n", - "\n", + "\n", "\n", - "-66498125748364619->3716186020170190693\n", + "-65660599333282282->312951075184987025\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "5227787437159759806\n", + "-8135483071169533727\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "3716186020170190693->5227787437159759806\n", + "312951075184987025->-8135483071169533727\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "4172821046690527989\n", + "-1102500011605602925\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "-1176888008802505673\n", + "-8978480336772077469\n", "\n", "2\n", "\n", - "\n", + "\n", "\n", - "4172821046690527989->-1176888008802505673\n", + "-1102500011605602925->-8978480336772077469\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "5068749226614284286\n", + "-1050760860597841152\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-1176888008802505673->5068749226614284286\n", + "-8978480336772077469->-1050760860597841152\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-7189200816447331052\n", + "-1906626916754175967\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "5330752747299492752\n", + "-2470839580670079044\n", "\n", "3\n", "\n", - "\n", + "\n", "\n", - "-7189200816447331052->5330752747299492752\n", + "-1906626916754175967->-2470839580670079044\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "3386821119738866121\n", + "4071937302134756076\n", "\n", 
"to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "5330752747299492752->3386821119738866121\n", + "-2470839580670079044->4071937302134756076\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "5177257767402434271\n", + "5178095293817516608\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "7440036120417123049\n", + "4036801175431919381\n", "\n", "4\n", "\n", - "\n", + "\n", "\n", - "5177257767402434271->7440036120417123049\n", + "5178095293817516608->4036801175431919381\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "3425514041675701871\n", + "8508987607055959954\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "7440036120417123049->3425514041675701871\n", + "4036801175431919381->8508987607055959954\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-9030167133868224737\n", + "-9029329607453142400\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "2546962091444426683\n", + "-856272853540776985\n", "\n", "5\n", "\n", - "\n", + "\n", "\n", - "-9030167133868224737->2546962091444426683\n", + "-9029329607453142400->-856272853540776985\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "7664833214114594479\n", + "-2363636268343853305\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "2546962091444426683->7664833214114594479\n", + "-856272853540776985->-2363636268343853305\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-810036887397515834\n", + "8774990811659256194\n", "\n", "0\n", "\n", - "\n", + "\n", "\n", - "-1856934773411932769->-810036887397515834\n", + "6071941830101125281->8774990811659256194\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "5697603868704482591\n", + "3881916782686559828\n", "\n", "1\n", "\n", - "\n", + "\n", "\n", - "5227787437159759806->5697603868704482591\n", + "-8135483071169533727->3881916782686559828\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "804529839731786225\n", + "-8057186534920993363\n", "\n", "2\n", "\n", - "\n", + "\n", "\n", - "5068749226614284286->804529839731786225\n", + "-1050760860597841152->-8057186534920993363\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "2913813212849416522\n", + "1098126126831493759\n", "\n", "3\n", "\n", - "\n", + "\n", "\n", - "3386821119738866121->2913813212849416522\n", + "4071937302134756076->1098126126831493759\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-9025290104758136669\n", + "7605766882933492184\n", "\n", "4\n", "\n", - "\n", + "\n", "\n", - "3425514041675701871->-9025290104758136669\n", + "8508987607055959954->7605766882933492184\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "4528379939978718581\n", + "-4333336434674061007\n", "\n", "5\n", "\n", - "\n", + "\n", "\n", - "7664833214114594479->4528379939978718581\n", + "-2363636268343853305->-4333336434674061007\n", "\n", "\n", "\n", @@ -724,7 +745,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -758,42 +779,14 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "2024-02-09 17:43:50,091 - distributed.worker - WARNING - Compute Failed\n", - "Key: ('to_pyarrow_string-be40c5ef4347788c673803e766c9f5ac', 5)\n", - "Function: execute_task\n", - "args: ((subgraph_callable-9daf7b14-cf6d-4b93-b345-9197c71b1866, [(, , 0, 24979433, b'\\n'), None, True, True]))\n", - "kwargs: {}\n", - "Exception: 'ValueError(\\'Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\\\\n\\\\n+----------------+---------+----------+\\\\n| Column | Found | Expected 
|\\\\n+----------------+---------+----------+\\\\n| CRSElapsedTime | float64 | int64 |\\\\n| TailNum | object | float64 |\\\\n+----------------+---------+----------+\\\\n\\\\nThe following columns also raised exceptions on conversion:\\\\n\\\\n- TailNum\\\\n ValueError(\"could not convert string to float: \\\\\\'N14346\\\\\\'\")\\\\n\\\\nUsually this is due to dask\\\\\\'s dtype inference failing, and\\\\n*may* be fixed by specifying dtypes manually by adding:\\\\n\\\\ndtype={\\\\\\'CRSElapsedTime\\\\\\': \\\\\\'float64\\\\\\',\\\\n \\\\\\'TailNum\\\\\\': \\\\\\'object\\\\\\'}\\\\n\\\\nto the call to `read_csv`/`read_table`.\\')'\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_35647/3690877535.py\", line 3, in \n", - " ddf.tail(3)\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/core.py\", line 1591, in tail\n", - " result = result.compute()\n", - " ^^^^^^^^^^^^^^^^\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/base.py\", line 342, in compute\n", - " (result,) = compute(self, traverse=False, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/base.py\", line 628, in compute\n", - " results = schedule(dsk, keys, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py\", line 142, in __call__\n", - " df = pandas_read_text(\n", - " ^^^^^^^^^^^^^^^^^\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py\", line 197, in pandas_read_text\n", - " coerce_dtypes(df, dtypes)\n", - " ^^^^^^^^^^^^^^^^^\n", - " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py\", line 298, in coerce_dtypes\n", - " raise ValueError(msg)\n", - " ^^^^^^^^^^^^^^^^^\n", - "ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n", + "ValueError, Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\n", "\n", "+----------------+---------+----------+\n", "| Column | Found | Expected |\n", @@ -815,14 +808,26 @@ "\n", "to the call to `read_csv`/`read_table`.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-20 13:26:56,131 - distributed.worker - WARNING - Compute Failed\n", + "Key: ('to_pyarrow_string-be40c5ef4347788c673803e766c9f5ac', 5)\n", + "Function: execute_task\n", + "args: ((subgraph_callable-104274d7-85ea-4720-954d-78db2282e8c8, [(, , 0, 24979433, b'\\n'), None, True, True]))\n", + "kwargs: {}\n", + "Exception: 'ValueError(\\'Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\\\\n\\\\n+----------------+---------+----------+\\\\n| Column | Found | Expected |\\\\n+----------------+---------+----------+\\\\n| CRSElapsedTime | float64 | int64 |\\\\n| TailNum | object | float64 |\\\\n+----------------+---------+----------+\\\\n\\\\nThe following columns also raised exceptions on conversion:\\\\n\\\\n- TailNum\\\\n ValueError(\"could not convert string to float: \\\\\\'N14346\\\\\\'\")\\\\n\\\\nUsually this is due to dask\\\\\\'s dtype inference failing, and\\\\n*may* be fixed by specifying dtypes manually by adding:\\\\n\\\\ndtype={\\\\\\'CRSElapsedTime\\\\\\': \\\\\\'float64\\\\\\',\\\\n \\\\\\'TailNum\\\\\\': \\\\\\'object\\\\\\'}\\\\n\\\\nto the call to `read_csv`/`read_table`.\\')'\n", + "\n" + ] } 
], "source": [ - "import traceback\n", "try:\n", " ddf.tail(3)\n", - "except Exception:\n", - " traceback.print_exc()" + "except Exception as e:\n", + " print(f\"{type(e).__name__}, {e}\")" ] }, { @@ -836,7 +841,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -985,7 +990,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 21, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -999,6 +1004,85 @@ "ddf.tail(3)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dask DataFrame 把各个字段的 Schema 放在了 `_meta` 里,包括字段的名称和类型。" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateDayOfWeekDepTimeCRSDepTimeArrTimeCRSArrTimeUniqueCarrierFlightNumTailNumActualElapsedTime...AirTimeArrDelayDepDelayOriginDestDistanceTaxiInTaxiOutCancelledDiverted
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Date, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime, UniqueCarrier, FlightNum, TailNum, ActualElapsedTime, CRSElapsedTime, AirTime, ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled, Diverted]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf._meta" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1008,7 +1092,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1020,8 +1104,8 @@ "JFK 255485\n", "LGA 591305\n", "Name: Origin, dtype: int64\n", - "CPU times: user 1.77 s, sys: 701 ms, total: 2.47 s\n", - "Wall time: 5.62 s\n" + "CPU times: user 1.67 s, sys: 328 ms, total: 2 s\n", + "Wall time: 5.49 s\n" ] } ], @@ -1038,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1050,8 +1134,8 @@ "JFK 255485\n", "LGA 591305\n", "Name: Origin, dtype: int64\n", - "CPU times: user 48.6 ms, sys: 16.1 ms, total: 64.6 ms\n", - "Wall time: 509 ms\n" + "CPU times: user 51.4 ms, sys: 11.1 ms, total: 62.5 ms\n", + "Wall time: 539 ms\n" ] } ], @@ -1080,32 +1164,52 @@ "\n", "具体而言,列式存储按照列进行存储,而不是 CSV 那样按行存储。数据分析时,我们可能只关心特定的列,而不是所有的列,因此在读取数据时,Parquet 允许很方便地过滤掉不需要的列,而不必读取整个行。因为减少了读取的数据量,Parquet 可以显著提高性能,也被广泛应用在 Apache Spark、Apache Hive 和 Apache Flink 等大数据生态。Parquet 自带了表模式,每个 Parquet 文件里嵌入了每个列的列名、数据类型等元数据,也就避免了 Dask DataFrame 进行表模式推测时推测不准确的问题。Parquet 中的数据是经过压缩的,相比 CSV,Parquet 更节省持久化存储的空间。\n", "\n", + "```{figure} ../img/ch-dask-dataframe/parquet.svg\n", + "---\n", + "width: 600px\n", + "name: parquet-with-row-group\n", + "---\n", + "Parquet 是一种列式存储,一个 Parquet 文件至少一个 Row Group,Row Group 中的数据按列存储,每个列存储在一个 Column Chunk 中。 \n", + "```\n", + "\n", "比如,应该尽量读取所需要的列,而不是所有的列。\n", "\n", "```python\n", "dd.read_parquet(\n", - " \"s3://path/to/parquet/\",\n", + " \"s3://path/to/folder/\",\n", " columns=[\"a\", \"b\", \"c\"]\n", ")\n", "```\n", "\n", - "此外,Parquet 提供了行分组(Row Group)的概念,如 {numref}`parquet-row-group` 所示,Parquet 文件中的数据是分组的,Row Group 定义了一个组内有多少行,这个例子中,一共 3 个 Row Group,每个 Row Group 有 2 行数据。每个 Row Group 存储了列的最大值和最小值等元数据,对这些列进行某些查询时,通过元数据的信息就可以确定是否读取这个 Row Group。比如,某时间列是一个时间序列,查询 “每天 9:00 至 12:00 的销量”,Row Group 中的元数据记录了时间列的最大值和最小值,通过元数据就可以判断是否有必要把这个 Row Group 读取出来,避免了读取不必要的开销。\n", - "\n", - "```{figure} ../img/ch-dask-dataframe/parquet-row-group.svg\n", - "---\n", - "width: 600px\n", - "name: parquet-row-group\n", - "---\n", - "Parquet 列式存储与 Row Group\n", - "```\n", + "此外,Parquet 提供了行分组(Row Group)的概念,如 {numref}`parquet-with-row-group` 所示,Parquet 文件中的数据是分组的,Row Group 定义了一个组内有多少行,这个例子中,一共 3 个 Row Group,每个 Row Group 有 2 行数据。每个 Row Group 存储了列的最大值和最小值等元数据,对这些列进行某些查询时,通过元数据的信息就可以确定是否读取这个 Row Group。比如,某时间列是一个时间序列,查询 “每天 9:00 至 12:00 的销量”,Row Group 中的元数据记录了时间列的最大值和最小值,通过元数据就可以判断是否有必要把这个 Row Group 读取出来,避免了读取不必要的开销。\n", "\n", "由于 Row Group 的引入,一个 Parquet 文件内可能有多个组,也就是说 Parquet 文件帮我们做了分组;不过很多 Parquet 文件的 Row Group 是 1。\n", "\n", "通常,数据集的文件被拆分为多个 Parquet,并且按照一定的方式来组织。比如,按照时间来切分:\n", "\n", "```\n", - "/path/folder/\n", - ".../year/month/day.parquet\n", + "path/to/folder/\n", + "├── year=2023/\n", + "│ ├── month=01/\n", + "│ │ └── part.0.parquet\n", + "│ └── month=02/\n", + "│ ├── part.0.parquet\n", + "│ └── part.1.parquet\n", + "└── year=2024/\n", + " └── month=01/\n", + " └── part.0.parquet\n", + "```\n", + "\n", + "可以过滤掉不需要的数据:\n", + "\n", + "```python\n", + 
"dd.read_parquet(\"path/to/folder/\", filters=[(\"year\", \">=\", 2023)])\n", + "```\n", + "\n", + "也可以根据这种目录结构存储数据:\n", + "\n", + "```python\n", + "df.to_parquet(\"path/to/folder/\", partition_on=[\"year\", \"month\"])\n", "```\n", "\n", "Dask DataFrame 在读取 Parquet 数据集时,除了根据文件数量外,又多了一个可能的依据:Row Group。如果有 m 个文件,那 Partition 可能的选项有:\n", @@ -1114,12 +1218,12 @@ "* 每个 Row Group 对应 Dask 的一个 Partition,假如一个 Parquet 文件有 n 个 Row Group,共有 m * n 个 Partition\n", "\n", "但无论哪种方式,应该尽量保证每个 Partition 所占用的内存空间不超过 Worker 的物理内存空间。Dask DataFrame 的\n", - "`read_parquet()` 通过 `split_row_groups` 参数给用户更多选项。默认为 `split_row_groups=\"infer\"`:Dask DataFrame 根据 Parquet 文件中的元数据来推测,是每个文件对应一个 Partition,还是每个 Row Group 对应一个 Partition;如果设置为 `split_row_groups=True` 则强制每个 Row Group 对应一个 Partition;如果设置为 `False`,则每个文件对应一个 Partition。" + "`read_parquet()` 通过 `split_row_groups` 参数给用户更多选项。如果设置为 `split_row_groups=True` 则强制每个 Row Group 对应一个 Partition;如果设置为 `False`,则每个文件对应一个 Partition。默认为 `split_row_groups=\"infer\"`:Dask DataFrame 根据读到的第一个 Parquet 文件的数据大小等元数据来推测,是每个文件对应一个 Partition,还是每个 Row Group 对应一个 Partition。" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "metadata": { "tags": [ "hide-cell" @@ -1129,6 +1233,13 @@ "source": [ "client.shutdown()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/ch-dask-dataframe/shuffle-test.ipynb b/ch-dask-dataframe/shuffle-test.ipynb new file mode 100644 index 0000000..63052e0 --- /dev/null +++ b/ch-dask-dataframe/shuffle-test.ipynb @@ -0,0 +1,1712 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(dask-dataframe-shuffle)=\n", + "# Shuffle\n", + "\n", + "在分布式场景下,`sort`,`merge`,`groupby` 有可能会在不同 Worker 之间交换数据,即 Shuffle。这些 pandas 算子在单机上实现起来比较简单,但是在大数据分布式计算场景,实现起来并不简单。\n", + "Dask 在 `2023.1` 版本之后提供了一种新的 Shuffle 方法,可以加速大部分计算任务。\n", + "\n", + "## `groupby`\n", + "\n", + "{numref}`dataframe-groupby` 展示了 `groupby` 在单机上的操作流程,它主要有三个阶段:分组、聚合、输出。分布式场景下,不同的数据分布在不同的 Partition 下。\n", + "\n", + "```{figure} ../img/ch-dask-dataframe/groupby.svg\n", + "---\n", + "width: 600px\n", + "name: dataframe-groupby\n", + "---\n", + "DataFrame groupby 示意图\n", + "```\n", + "\n", + "* `groupby(indexed_columns).agg()` 和 `groupby(indexed_columns).apply(user_def_fn)` 性能最好。`indexed_columns` 指的是索引列 Key,`agg` 指的是 Dask DataFrame 提供的官方的 `sum`,`mean`,`nunique` 等聚合方法。因为 `indexed_columns` 是排过序的了,可以很快地对 `indexed_columns` 进行分组,Shuffle 数据量不大。\n", + "* `groupby(non_indexed_columns).agg()` 的数据交换量要更大一些,`agg` 是 Dask 官方提供的方法,做过一些优化。\n", + "* `groupby(non_indexed_columns).apply(user_def_fn)` 的成本最高。它既要对所有数据进行交换,又要执行用户自定义的函数,\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib\n", + "import shutil\n", + "import warnings\n", + "\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", + "download_url_prefix = \"https://gender-pay-gap.service.gov.uk/viewing/download-data/\"\n", + "file_path_prefix = os.path.join(folder_path, \"gender-pay\")\n", + "if not os.path.exists(file_path_prefix):\n", + " os.makedirs(file_path_prefix)\n", + "for year in [2017, 2018, 2019, 2020, 2021, 2022]:\n", + " download_url = download_url_prefix + str(year)\n", + " file_path = os.path.join(file_path_prefix, f\"{str(year)}.csv\")\n", + " if not os.path.exists(file_path):\n", + " with urllib.request.urlopen(download_url) as response, 
open(file_path, 'wb') as out_file:\n", + " shutil.copyfileobj(response, out_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "import pandas as pd\n", + "from dask.distributed import LocalCluster, Client\n", + "\n", + "cluster = LocalCluster()\n", + "client = Client(cluster)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeEmployerSizeDiffMeanHourlyPercent
0DT11 0PX500 to 99918.0
1EH6 8NU250 to 4992.3
2LS7 1AB250 to 49941.0
3TA6 3JA250 to 499-22.0
4SR5 1SU250 to 49913.4
\n", + "
" + ], + "text/plain": [ + " PostCode EmployerSize DiffMeanHourlyPercent\n", + "0 DT11 0PX 500 to 999 18.0\n", + "1 EH6 8NU 250 to 499 2.3\n", + "2 LS7 1AB 250 to 499 41.0\n", + "3 TA6 3JA 250 to 499 -22.0\n", + "4 SR5 1SU 250 to 499 13.4" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf = dd.read_csv(os.path.join(file_path_prefix, \"*.csv\"),\n", + " dtype={'EmployerSize': 'str',\n", + " 'DiffMeanHourlyPercent': 'float64'})\n", + "\n", + "def fillna(df):\n", + " return df.fillna(value={\"PostCode\": \"UNKNOWN\"})\n", + "\n", + "ddf = ddf[[\"PostCode\", \"EmployerSize\", \"DiffMeanHourlyPercent\"]]\n", + "ddf = ddf.dropna()\n", + "# ddf = ddf.map_partitions(fillna)\n", + "ddf.head(5)\n", + " # .map_partitions(update_empsize_to_median)\n", + "# ddf = ddf.map_partitions(update_empsize_to_median)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PostCode string[pyarrow]\n", + "EmployerSize string[pyarrow]\n", + "DiffMeanHourlyPercent float64\n", + "dtype: object" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# ddf['EmployerSize']=ddf['EmployerSize'].astype(str)\n", + "def update_empsize_to_median(df):\n", + " def to_median(value):\n", + " if isinstance(value, str):\n", + " if \" to \" in value:\n", + " f , t = value.replace(\",\", \"\").split(\" to \")\n", + " return (int(f) + int(t)) / 2.0\n", + " elif \"Less than\" in value:\n", + " return 100\n", + " else:\n", + " return 10000\n", + " else:\n", + " return 0\n", + " df[\"EmployerSize\"] = df[\"EmployerSize\"].apply(to_median)\n", + " return df\n", + "\n", + "try:\n", + " ddf = ddf.map_partitions(update_empsize_to_median)\n", + "except Exception as e:\n", + " print(f\"{type(e).__name__}, {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeEmployerSizeDiffMeanHourlyPercentPostCodeLength
0DT11 0PX749.518.08
1EH6 8NU374.52.37
2LS7 1AB374.541.07
3TA6 3JA374.5-22.07
4SR5 1SU374.513.47
\n", + "
" + ], + "text/plain": [ + " PostCode EmployerSize DiffMeanHourlyPercent PostCodeLength\n", + "0 DT11 0PX 749.5 18.0 8\n", + "1 EH6 8NU 374.5 2.3 7\n", + "2 LS7 1AB 374.5 41.0 7\n", + "3 TA6 3JA 374.5 -22.0 7\n", + "4 SR5 1SU 374.5 13.4 7" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf['PostCodeLength'] = ddf['PostCode'].str.len()\n", + "ddf.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_49527/3631319032.py:14: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "data": { + "text/plain": [ + "PostCode\n", + " BS34 7QH 7.375000\n", + " DE14 2EB 0.070000\n", + " DN15 6NL 3.666667\n", + " OX1 4BH 17.460000\n", + " S65 1EG 14.716667\n", + " ... \n", + "WS11 0DJ 14.600000\n", + "WS13 8EL -4.800000\n", + "WV10 8DS 24.330000\n", + "WV6 8DA 18.100000\n", + "YO25 8EJ 12.930000\n", + "Name: DiffMeanHourlyPercent, Length: 9118, dtype: float64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = ddf.groupby('PostCode')['DiffMeanHourlyPercent'].mean()\n", + "d.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'EmployerSize'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:175\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/index_class_helper.pxi:70\u001b[0m, in \u001b[0;36mpandas._libs.index.Int64Engine._check_type\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'EmployerSize'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 18\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m total \u001b[38;5;241m/\u001b[39m weights\n\u001b[1;32m 12\u001b[0m weighted_mean \u001b[38;5;241m=\u001b[39m dd\u001b[38;5;241m.\u001b[39mAggregation(\n\u001b[1;32m 13\u001b[0m 
name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mweighted_mean\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 14\u001b[0m chunk\u001b[38;5;241m=\u001b[39mprocess_chunk,\n\u001b[1;32m 15\u001b[0m agg\u001b[38;5;241m=\u001b[39magg,\n\u001b[1;32m 16\u001b[0m finalize\u001b[38;5;241m=\u001b[39mfinalize)\n\u001b[0;32m---> 18\u001b[0m aggregated \u001b[38;5;241m=\u001b[39m \u001b[43mddf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPostCode\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEmployerSize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDiffMeanHourlyPercent\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweighted_mean\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m aggregated\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m10\u001b[39m)\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_groupby.py:1909\u001b[0m, in \u001b[0;36mGroupBy.agg\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1908\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21magg\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m-> 1909\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maggregate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_groupby.py:1893\u001b[0m, in \u001b[0;36mGroupBy.aggregate\u001b[0;34m(self, arg, split_every, split_out, shuffle_method, **kwargs)\u001b[0m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arg \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 1891\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize()\n\u001b[0;32m-> 1893\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mnew_collection\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1894\u001b[0m \u001b[43m \u001b[49m\u001b[43mGroupbyAggregation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1895\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1896\u001b[0m \u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1897\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1898\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1899\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit_every\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1900\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msplit_out\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1901\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1902\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle_method\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1903\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_slice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1904\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1905\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1906\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_collection.py:4578\u001b[0m, in \u001b[0;36mnew_collection\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 4576\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnew_collection\u001b[39m(expr):\n\u001b[1;32m 4577\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Create new collection from an expr\"\"\"\u001b[39;00m\n\u001b[0;32m-> 4578\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mexpr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\n\u001b[1;32m 4579\u001b[0m expr\u001b[38;5;241m.\u001b[39m_name \u001b[38;5;66;03m# Ensure backend is imported\u001b[39;00m\n\u001b[1;32m 4580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m get_collection_type(meta)(expr)\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/functools.py:1001\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 999\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 1000\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m-> 1001\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1003\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_groupby.py:431\u001b[0m, in \u001b[0;36mGroupbyAggregation._meta\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mcached_property\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_meta\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_lower\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/functools.py:1001\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 999\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 1000\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m-> 1001\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1003\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_reductions.py:429\u001b[0m, in \u001b[0;36mApplyConcatApply._meta\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mcached_property\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_meta\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 429\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta_chunk\u001b[49m\n\u001b[1;32m 430\u001b[0m aggregate \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maggregate \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;28;01mlambda\u001b[39;00m x: x)\n\u001b[1;32m 431\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcombine:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/functools.py:1001\u001b[0m, in \u001b[0;36mcached_property.__get__\u001b[0;34m(self, instance, owner)\u001b[0m\n\u001b[1;32m 999\u001b[0m val \u001b[38;5;241m=\u001b[39m cache\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname, _NOT_FOUND)\n\u001b[1;32m 1000\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m val \u001b[38;5;129;01mis\u001b[39;00m _NOT_FOUND:\n\u001b[0;32m-> 1001\u001b[0m val \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1003\u001b[0m cache[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mattrname] \u001b[38;5;241m=\u001b[39m val\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_groupby.py:205\u001b[0m, in \u001b[0;36mGroupByApplyConcatApply._meta_chunk\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 202\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mcached_property\n\u001b[1;32m 203\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_meta_chunk\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 204\u001b[0m meta \u001b[38;5;241m=\u001b[39m meta_nonempty(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframe\u001b[38;5;241m.\u001b[39m_meta)\n\u001b[0;32m--> 205\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmeta\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_by_meta\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_kwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
[Traceback condensed; ANSI colour codes stripped.]
File dask/dataframe/groupby.py:1200, in _groupby_apply_funcs: r = func(grouped, **func_kwargs)
File dask/dataframe/groupby.py:1278, in _apply_func_to_column: return func(df_like[column])
Cell In[5], line 4, in process_chunk: return (chunk.apply(weighted_func), chunk["EmployerSize"].sum())
File pandas/core/groupby/generic.py:230, in SeriesGroupBy.apply: return super().apply(func, *args, **kwargs)
File pandas/core/groupby/groupby.py:1824, in GroupBy.apply: result = self._python_apply_general(f, self._selected_obj)
File pandas/core/groupby/groupby.py:1885, in GroupBy._python_apply_general: values, mutated = self._grouper.apply_groupwise(f, data, self.axis)
File pandas/core/groupby/ops.py:919, in BaseGrouper.apply_groupwise: res = f(group)
Cell In[5], line 3, in process_chunk.<locals>.weighted_func: return (df["EmployerSize"] * df["DiffMeanHourlyPercent"]).sum()
File pandas/core/series.py:1112, in Series.__getitem__: return self._get_value(key)
File pandas/core/series.py:1228, in Series._get_value: loc = self.index.get_loc(label)
File pandas/core/indexes/base.py:3812, in Index.get_loc: raise KeyError(key) from err
# If we have a listlike key, _check_indexing_error will raise InvalidIndexError.
Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'EmployerSize'" + ] + } + ], + "source": [ + "def process_chunk(chunk):\n", + " def weighted_func(df):\n", + " return (df[\"EmployerSize\"] * df[\"DiffMeanHourlyPercent\"]).sum()\n", + " return (chunk.apply(weighted_func), chunk[\"EmployerSize\"].sum())\n", + "\n", + "def agg(total, weights):\n", + " return (total.sum(), weights.sum())\n", + "\n", + "def finalize(total, weights):\n", + " return total / weights\n", + " \n", + "weighted_mean = dd.Aggregation(\n", + " name='weighted_mean',\n", + " chunk=process_chunk,\n", + " agg=agg,\n", + " finalize=finalize)\n", + "\n", + "aggregated = ddf.groupby(\"PostCode\")[\"EmployerSize\", \"DiffMeanHourlyPercent\"].agg(weighted_mean)\n", + "aggregated.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_49527/3631319032.py:14: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" + ] + }, + { + "data": { + "text/html": [ + "
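The `KeyError: 'EmployerSize'` above comes from how `dd.Aggregation` dispatches work: when columns are selected before `.agg()`, Dask applies the `chunk` function to each selected column separately, so `chunk` receives a single-column `SeriesGroupBy`, and `df["EmployerSize"]` inside `weighted_func` ends up looking for a label in a Series rather than selecting a column. A weighted mean needs both columns at once, which this per-column contract cannot express. A minimal alternative sketch (not the notebook's final solution; it assumes `ddf` still holds the pay-gap data with `PostCode`, `EmployerSize`, and `DiffMeanHourlyPercent` columns) is `groupby().apply()` with an explicit `meta`:

```python
def weighted_mean(g):
    # g is a pandas DataFrame holding all rows of one PostCode group
    return (g["EmployerSize"] * g["DiffMeanHourlyPercent"]).sum() / g["EmployerSize"].sum()

# meta declares the output: a float64 Series named "WeightedMean".
# groupby().apply() shuffles the data, so it is slower than built-in
# aggregations, but it accepts arbitrary multi-column functions.
result = ddf.groupby("PostCode").apply(weighted_mean, meta=("WeightedMean", "f8"))
result.head()
```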
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeEmployerSizeDiffMeanHourlyPercentPostCodeLength
0DT11 0PX749.518.08
1EH6 8NU374.52.37
2LS7 1AB374.541.07
3TA6 3JA374.5-22.07
4SR5 1SU374.513.47
...............
10838SN1 1AP2999.525.97
10839PO15 7JZ2999.516.48
10840SK11 0LP374.518.98
10841SY5 0BD749.520.07
10842SE1 2AU374.53.77
\n", + "

58849 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " PostCode EmployerSize DiffMeanHourlyPercent PostCodeLength\n", + "0 DT11 0PX 749.5 18.0 8\n", + "1 EH6 8NU 374.5 2.3 7\n", + "2 LS7 1AB 374.5 41.0 7\n", + "3 TA6 3JA 374.5 -22.0 7\n", + "4 SR5 1SU 374.5 13.4 7\n", + "... ... ... ... ...\n", + "10838 SN1 1AP 2999.5 25.9 7\n", + "10839 PO15 7JZ 2999.5 16.4 8\n", + "10840 SK11 0LP 374.5 18.9 8\n", + "10841 SY5 0BD 749.5 20.0 7\n", + "10842 SE1 2AU 374.5 3.7 7\n", + "\n", + "[58849 rows x 4 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ddf.groupby(\"PostCode\")['DiffMeanHourlyPercent'].sum()\n", + "d = ddf.compute()\n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Metadata inference failed in `_groupby_apply_funcs`.\n\nYou have supplied a custom function and Dask is unable to \ndetermine the type of output that that function returns. \n\nTo resolve this please provide a meta= keyword.\nThe docstring of the Dask function you ran should have more information.\n\nOriginal error is below:\n------------------------\nIndexError('Column(s) DiffMeanHourlyPercent already selected')\n\nTraceback:\n---------\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/utils.py\", line 194, in raise_on_meta_error\n yield\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/core.py\", line 7057, in _emulate\n return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/groupby.py\", line 1194, in _groupby_apply_funcs\n r = func(grouped, **func_kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/groupby.py\", line 1240, in _apply_func_to_column\n return func(df_like[column])\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_50604/416441874.py\", line 3, in \n chunk=lambda s: (s['EmployerSize'].count(), s['DiffMeanHourlyPercent'].sum()),\n ~^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/base.py\", line 234, in __getitem__\n raise IndexError(f\"Column(s) {self._selection} already selected\")\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/utils.py:194\u001b[0m, in \u001b[0;36mraise_on_meta_error\u001b[0;34m(funcname, udf)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 194\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/core.py:7057\u001b[0m, in \u001b[0;36m_emulate\u001b[0;34m(func, udf, *args, **kwargs)\u001b[0m\n\u001b[1;32m 7056\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m raise_on_meta_error(funcname(func), udf\u001b[38;5;241m=\u001b[39mudf), check_numeric_only_deprecation():\n\u001b[0;32m-> 7057\u001b[0m 
[ANSI-coloured traceback condensed; it repeats the plain-text traceback embedded in the error message.]
File dask/dataframe/core.py:7057, in _emulate: return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
File dask/dataframe/groupby.py:1194, in _groupby_apply_funcs: r = func(grouped, **func_kwargs)
File dask/dataframe/groupby.py:1240, in _apply_func_to_column: return func(df_like[column])
Cell In[10], line 3, in <lambda>: chunk=lambda s: (s['EmployerSize'].count(), s['DiffMeanHourlyPercent'].sum())
File pandas/core/base.py:234, in SelectionMixin.__getitem__: raise IndexError(f"Column(s) {self._selection} already selected")
IndexError: Column(s) DiffMeanHourlyPercent already selected

The above exception was the direct cause of the following exception:
Cell In[10], line 7: a = ddf.groupby('PostCode').agg(custom_mean)
File dask/dataframe/groupby.py:2928, in DataFrameGroupBy.agg -> aggregate -> apply_concat_apply -> _emulate -> raise_on_meta_error
ValueError: Metadata inference failed in `_groupby_apply_funcs`.

You have supplied a custom function and Dask is unable to determine the type of output that that function returns.
\n\nTo resolve this please provide a meta= keyword.\nThe docstring of the Dask function you ran should have more information.\n\nOriginal error is below:\n------------------------\nIndexError('Column(s) DiffMeanHourlyPercent already selected')\n\nTraceback:\n---------\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/utils.py\", line 194, in raise_on_meta_error\n yield\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/core.py\", line 7057, in _emulate\n return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/groupby.py\", line 1194, in _groupby_apply_funcs\n r = func(grouped, **func_kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/groupby.py\", line 1240, in _apply_func_to_column\n return func(df_like[column])\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_50604/416441874.py\", line 3, in \n chunk=lambda s: (s['EmployerSize'].count(), s['DiffMeanHourlyPercent'].sum()),\n ~^^^^^^^^^^^^^^^^\n File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/base.py\", line 234, in __getitem__\n raise IndexError(f\"Column(s) {self._selection} already selected\")\n" + ] + } + ], + "source": [ + "custom_mean = dd.Aggregation(\n", + " name='custom_mean',\n", + " chunk=lambda s: (s['EmployerSize'].count(), s['DiffMeanHourlyPercent'].sum()),\n", + " agg=lambda count, sum: (count.sum(), sum.sum()),\n", + " finalize=lambda count, sum: sum / count,\n", + ") \n", + "a = ddf.groupby('PostCode').agg(custom_mean)\n", + "a.head(5) " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
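The `IndexError('Column(s) DiffMeanHourlyPercent already selected')` that breaks metadata inference has the same root cause: the `chunk` lambda is called once per selected column with a `SeriesGroupBy` that is already restricted to that column, so `s['EmployerSize']` inside it is rejected by pandas. Written against a single column, a custom mean does fit the `chunk` / `agg` / `finalize` contract; a hedged sketch:

```python
# chunk    runs on each partition's SeriesGroupBy,
# agg      combines the per-partition results,
# finalize turns the combined results into one value per group.
custom_mean = dd.Aggregation(
    name="custom_mean",
    chunk=lambda s: (s.count(), s.sum()),
    agg=lambda count, total: (count.sum(), total.sum()),
    finalize=lambda count, total: total / count,
)

ddf.groupby("PostCode")["DiffMeanHourlyPercent"].agg(custom_mean).head()
```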
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EmployerNameEmployerIdAddressPostCodeCompanyNumberSicCodesDiffMeanHourlyPercentDiffMedianHourlyPercentDiffMeanBonusPercentDiffMedianBonusPercent...FemaleUpperMiddleQuartileMaleTopQuartileFemaleTopQuartileCompanyLinkToGPGInfoResponsiblePersonEmployerSizeCurrentNameSubmittedAfterTheDeadlineDueDateDateSubmitted
1\"RED BAND\" CHEMICAL COMPANY, LIMITED1687919 Smith's Place, Leith Walk, Edinburgh, EH6 8NUEH6 8NUSC016876477302.3-2.715.037.5...89.718.181.9<NA>Philip Galt (Managing Director)250 to 499\"RED BAND\" CHEMICAL COMPANY, LIMITEDFalse2018/04/05 00:00:002018/03/28 16:44:25
2123 EMPLOYEES LTD1767734 Roundhay Road, Leeds, England, LS7 1ABLS7 1AB105306517830041.036.0-69.8-157.2...89.023.077.0<NA>Chloe Lines (Financial Controller)250 to 499123 EMPLOYEES LTDTrue2018/04/05 00:00:002018/05/04 11:24:06
71STOP HALAL LIMITED689Colmore Court, 9 Colmore Row, Birmingham, West...B3 2BJ089290705629011.90.00.00.0...41.969.830.2<NA>Stephen Elder (Finance Director)250 to 499SHAZAN FOODS LIMITEDFalse2018/04/05 00:00:002018/03/22 08:08:33
\n", + "

3 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " EmployerName EmployerId \\\n", + "1 \"RED BAND\" CHEMICAL COMPANY, LIMITED 16879 \n", + "2 123 EMPLOYEES LTD 17677 \n", + "7 1STOP HALAL LIMITED 689 \n", + "\n", + " Address PostCode CompanyNumber \\\n", + "1 19 Smith's Place, Leith Walk, Edinburgh, EH6 8NU EH6 8NU SC016876 \n", + "2 34 Roundhay Road, Leeds, England, LS7 1AB LS7 1AB 10530651 \n", + "7 Colmore Court, 9 Colmore Row, Birmingham, West... B3 2BJ 08929070 \n", + "\n", + " SicCodes DiffMeanHourlyPercent DiffMedianHourlyPercent \\\n", + "1 47730 2.3 -2.7 \n", + "2 78300 41.0 36.0 \n", + "7 56290 11.9 0.0 \n", + "\n", + " DiffMeanBonusPercent DiffMedianBonusPercent ... \\\n", + "1 15.0 37.5 ... \n", + "2 -69.8 -157.2 ... \n", + "7 0.0 0.0 ... \n", + "\n", + " FemaleUpperMiddleQuartile MaleTopQuartile FemaleTopQuartile \\\n", + "1 89.7 18.1 81.9 \n", + "2 89.0 23.0 77.0 \n", + "7 41.9 69.8 30.2 \n", + "\n", + " CompanyLinkToGPGInfo ResponsiblePerson EmployerSize \\\n", + "1 Philip Galt (Managing Director) 250 to 499 \n", + "2 Chloe Lines (Financial Controller) 250 to 499 \n", + "7 Stephen Elder (Finance Director) 250 to 499 \n", + "\n", + " CurrentName SubmittedAfterTheDeadline \\\n", + "1 \"RED BAND\" CHEMICAL COMPANY, LIMITED False \n", + "2 123 EMPLOYEES LTD True \n", + "7 SHAZAN FOODS LIMITED False \n", + "\n", + " DueDate DateSubmitted \n", + "1 2018/04/05 00:00:00 2018/03/28 16:44:25 \n", + "2 2018/04/05 00:00:00 2018/05/04 11:24:06 \n", + "7 2018/04/05 00:00:00 2018/03/22 08:08:33 \n", + "\n", + "[3 rows x 27 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "na_rows.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Dask DataFrame Structure:\n", + " Date DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled Diverted\n", + "npartitions=6 \n", + " datetime64[ns] int64 float64 int64 float64 int64 string int64 float64 float64 int64 float64 float64 float64 string string float64 float64 float64 int64 int64\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "Dask Name: try_loc, 3 graph layers" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def process_chunk(chunk):\n", + " def weighted_func(df):\n", + " return (df[\"EmployerSize\"] * df[\"DiffMeanHourlyPercent\"]).sum()\n", + " return (chunk.apply(weighted_func), chunk.sum()[\"EmployerSize\"])\n", + " \n", + "def agg(total, weights):\n", + " return (total.sum(), weights.sum())\n", + "\n", + "def finalize(total, weights):\n", + " return total / weights\n", + " \n", + "weighted_mean = dd.Aggregation(\n", + " name='weighted_mean',\n", + " chunk=process_chunk,\n", + " agg=agg,\n", + " finalize=finalize)\n", + "\n", + "aggregated = ddf.groupby(\"PostCode\")[\"EmployerSize\", \"DiffMeanHourlyPercent\"].agg(weighted_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeEmployerSizeDiffMeanHourlyPercent
0A1000.1
1A2000.2
2B3000.3
3B4000.4
\n", + "
" + ], + "text/plain": [ + " PostCode EmployerSize DiffMeanHourlyPercent\n", + "0 A 100 0.1\n", + "1 A 200 0.2\n", + "2 B 300 0.3\n", + "3 B 400 0.4" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {'PostCode': ['A', 'A', 'B', 'B'],\n", + " 'EmployerSize': [100, 200, 300, 400],\n", + " 'DiffMeanHourlyPercent': [0.1, 0.2, 0.3, 0.4]}\n", + "df = pd.DataFrame(data)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1000" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sum()[\"EmployerSize\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1000" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['EmployerSize'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeEmployerSizeDiffMeanHourlyPercent
0A1000.1
1A2000.2
2B3000.3
3B4000.4
\n", + "
" + ], + "text/plain": [ + " PostCode EmployerSize DiffMeanHourlyPercent\n", + "0 A 100 0.1\n", + "1 A 200 0.2\n", + "2 B 300 0.3\n", + "3 B 400 0.4" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = {'PostCode': ['A', 'A', 'B', 'B'], 'EmployerSize': [100, 200, 300, 400], 'DiffMeanHourlyPercent': [0.1, 0.2, 0.3, 0.4]}\n", + "\n", + "df = pd.DataFrame(data)\n", + "ddf = dd.from_pandas(df, 2)\n", + "ddf.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " PostCode EmployerSize DiffMeanHourlyPercent\n", + "0 A 100 0.1\n", + "1 A 200 0.2\n", + " PostCode EmployerSize DiffMeanHourlyPercent\n", + "2 B 300 0.3\n", + "3 B 400 0.4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_65513/3023763576.py:5: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " result = df.groupby('PostCode').apply(weighted_mean).reset_index(name='WeightedMean')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PostCodeWeightedMean
0A0.166667
1B0.357143
\n", + "
" + ], + "text/plain": [ + " PostCode WeightedMean\n", + "0 A 0.166667\n", + "1 B 0.357143" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def weighted_mean(data):\n", + " print(data)\n", + " return (data['EmployerSize'] * data['DiffMeanHourlyPercent']).sum() / data['EmployerSize'].sum()\n", + "\n", + "result = df.groupby('PostCode').apply(weighted_mean).reset_index(name='WeightedMean')\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PostCode\n", + "A 0.166667\n", + "B 0.357143\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Weighted\"] = df[\"EmployerSize\"] * df[\"DiffMeanHourlyPercent\"]\n", + "df = df.groupby(\"PostCode\").sum()\n", + "df = df[\"Weighted\"] / df[\"EmployerSize\"]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"Columns not found: 'EmployerSize'\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[49], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m total \u001b[38;5;241m/\u001b[39m weights\n\u001b[1;32m 14\u001b[0m extent \u001b[38;5;241m=\u001b[39m dd\u001b[38;5;241m.\u001b[39mAggregation(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mextent\u001b[39m\u001b[38;5;124m'\u001b[39m, chunk, agg, finalize\u001b[38;5;241m=\u001b[39mfinalize)\n\u001b[0;32m---> 15\u001b[0m \u001b[43mddf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mPostCode\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mEmployerSize\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mDiffMeanHourlyPercent\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39magg(extent)\u001b[38;5;241m.\u001b[39mcompute()\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/groupby.py:2885\u001b[0m, in \u001b[0;36mDataFrameGroupBy.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2883\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m 2884\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 2885\u001b[0m g\u001b[38;5;241m.\u001b[39m_meta \u001b[38;5;241m=\u001b[39m \u001b[43mg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 2886\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m g\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/groupby/generic.py:1964\u001b[0m, in \u001b[0;36mDataFrameGroupBy.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1957\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, \u001b[38;5;28mtuple\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) 
\u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 1958\u001b[0m \u001b[38;5;66;03m# if len == 1, then it becomes a SeriesGroupBy and this is actually\u001b[39;00m\n\u001b[1;32m 1959\u001b[0m \u001b[38;5;66;03m# valid syntax, so don't raise\u001b[39;00m\n\u001b[1;32m 1960\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1961\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot subset columns with a tuple with more than one element. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1962\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUse a list instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1963\u001b[0m )\n\u001b[0;32m-> 1964\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/base.py:239\u001b[0m, in \u001b[0;36mSelectionMixin.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mintersection(key)) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mset\u001b[39m(key)):\n\u001b[1;32m 238\u001b[0m bad_keys \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(key)\u001b[38;5;241m.\u001b[39mdifference(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mobj\u001b[38;5;241m.\u001b[39mcolumns))\n\u001b[0;32m--> 239\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumns not found: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(bad_keys)[\u001b[38;5;241m1\u001b[39m:\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gotitem(\u001b[38;5;28mlist\u001b[39m(key), ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 242\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyError\u001b[0m: \"Columns not found: 'EmployerSize'\"" + ] + } + ], + "source": [ + "def chunk(chunk):\n", + " def weighted_func(df):\n", + " return (df[\"EmployerSize\"] * df[\"DiffMeanHourlyPercent\"]).sum()\n", + " return (chunk.apply(weighted_func), chunk.sum()[\"EmployerSize\"])\n", + "\n", + "def agg(total, weights):\n", + " return (total.sum(), weights.sum())\n", + " # return chunk_maxes.max(), chunk_mins.min()\n", + "\n", + "def finalize(total, weights):\n", + " return total / weights\n", + "\n", + "extent = dd.Aggregation('extent', chunk, agg, finalize=finalize)\n", + "ddf.groupby(\"PostCode\")['EmployerSize', 'DiffMeanHourlyPercent'].agg(extent).compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " 'PostCode': ['a', 'b', 'a', 'a', 'b', 'c', 'd'],\n", + " 'Size': [0, 1, 0, 2, 5, 3, 4],\n", + " 'DiffMeanHourlyPercent': [0.1, 0.2, 0.3, 0.4, 0.5, 0.1, 0.2],\n", + "})\n", + "ddf = dd.from_pandas(df, 1)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 1.0 1.0 NaN 1.0 1.0 1.0 1.0 1.0\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 3.0 0.266667 0.152753 0.1 0.200 0.30 0.350 0.4\n", + "b 2.0 0.350000 0.212132 0.2 0.275 0.35 0.425 0.5\n", + "c 1.0 0.100000 NaN 0.1 0.100 0.10 0.100 0.1\n", + "d 1.0 0.200000 NaN 0.2 0.200 0.20 0.200 0.2\n", + " count mean std min 25% 50% 75% max\n", + "PostCode \n", + "a 3.0 0.666667 1.154701 0.0 0.0 0.0 1.0 2.0\n", + "b 2.0 3.000000 2.828427 1.0 2.0 3.0 4.0 5.0\n", + "c 1.0 3.000000 NaN 3.0 3.0 3.0 3.0 3.0\n", + "d 1.0 4.000000 NaN 4.0 4.0 4.0 4.0 4.0\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " Size DiffMeanHourlyPercent\n", + "PostCode \n", + "a 2 0.3\n", + "b 4 0.3\n", + "c 0 0.0\n", + "d 0 0.0" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-03-20 13:47:52,849 - tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/tornado/ioloop.py\", line 937, in _run\n", + " val = self.callback()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/system_monitor.py\", line 167, in update\n", + " net_ioc = psutil.net_io_counters()\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/psutil/__init__.py\", line 2126, in net_io_counters\n", + " rawdict = _psplatform.net_io_counters()\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: [Errno 12] Cannot allocate memory\n", + "2024-03-20 19:55:47,781 - tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/tornado/ioloop.py\", line 937, in _run\n", + " val = self.callback()\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/system_monitor.py\", line 167, in update\n", + " net_ioc = psutil.net_io_counters()\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/psutil/__init__.py\", line 2126, in net_io_counters\n", + " rawdict = _psplatform.net_io_counters()\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: [Errno 12] Cannot allocate memory\n" + ] + } + ], + "source": [ + "def chunk(grouped):\n", + " print(grouped.describe())\n", + " return grouped.max(), grouped.min()\n", + "\n", + "def agg(chunk_maxes, chunk_mins):\n", + " return chunk_maxes.max(), chunk_mins.min()\n", + "\n", + "def finalize(maxima, minima):\n", + " return maxima - minima\n", + "\n", + "extent = dd.Aggregation('extent', chunk, agg, finalize=finalize)\n", + "ddf.groupby('PostCode')[\"Size\", \"DiffMeanHourlyPercent\"].agg(extent).compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame({\n", + " 'A': ['a', 'b', 'a', 'a', 'b'],\n", + " 'B': [0, 1, 0, 2, 5],\n", + "})\n", + "ddf = dd.from_pandas(df, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def chunk(grouped):\n", + " print(grouped)\n", + " return grouped.max(), grouped.min()\n", + "\n", + "def agg(chunk_maxes, chunk_mins):\n", + " return chunk_maxes.max(), chunk_mins.min()\n", + "\n", + "def finalize(maxima, minima):\n", + " return maxima - minima" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'A'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m extent \u001b[38;5;241m=\u001b[39m dd\u001b[38;5;241m.\u001b[39mAggregation(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mextent\u001b[39m\u001b[38;5;124m'\u001b[39m, 
chunk, agg, finalize\u001b[38;5;241m=\u001b[39mfinalize)\n\u001b[0;32m----> 2\u001b[0m \u001b[43mddf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mA\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39magg(extent)\u001b[38;5;241m.\u001b[39mcompute()\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_collection.py:2840\u001b[0m, in \u001b[0;36mDataFrame.groupby\u001b[0;34m(self, by, group_keys, sort, observed, dropna, **kwargs)\u001b[0m\n\u001b[1;32m 2835\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(by, FrameBase) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(by, Series):\n\u001b[1;32m 2836\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 2837\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`by` must be a column name or list of columns, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mby\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2838\u001b[0m )\n\u001b[0;32m-> 2840\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mGroupBy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2841\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2842\u001b[0m \u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2843\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroup_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2844\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2845\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2846\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2847\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2848\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/dask_expr/_groupby.py:1519\u001b[0m, in \u001b[0;36mGroupBy.__init__\u001b[0;34m(self, obj, by, group_keys, sort, observed, dropna, slice)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mby \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1514\u001b[0m [by]\n\u001b[1;32m 1515\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39misscalar(by) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(by, Expr) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(by, Callable)\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(by)\n\u001b[1;32m 1517\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;66;03m# surface pandas errors\u001b[39;00m\n\u001b[0;32m-> 1519\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_meta \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1520\u001b[0m \u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1521\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroup_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1522\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m_as_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mobserved\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1524\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m_as_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdropna\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1525\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mslice\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1527\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mslice\u001b[39m, \u001b[38;5;28mtuple\u001b[39m):\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/frame.py:9170\u001b[0m, in \u001b[0;36mDataFrame.groupby\u001b[0;34m(self, by, axis, level, as_index, sort, group_keys, observed, dropna)\u001b[0m\n\u001b[1;32m 9167\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m level \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m by \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 9168\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou have to supply one of \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mby\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 9170\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mDataFrameGroupBy\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9171\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9172\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9173\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9174\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9175\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mas_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mas_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9176\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9177\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroup_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup_keys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9178\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9179\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9180\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:1329\u001b[0m, in \u001b[0;36mGroupBy.__init__\u001b[0;34m(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, observed, dropna)\u001b[0m\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropna \u001b[38;5;241m=\u001b[39m dropna\n\u001b[1;32m 1328\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m grouper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1329\u001b[0m grouper, exclusions, obj \u001b[38;5;241m=\u001b[39m \u001b[43mget_grouper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1330\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1331\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1332\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1333\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1334\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1335\u001b[0m \u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mno_default\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mobserved\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1336\u001b[0m \u001b[43m \u001b[49m\u001b[43mdropna\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdropna\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1337\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m observed \u001b[38;5;129;01mis\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mno_default:\n\u001b[1;32m 1340\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(ping\u001b[38;5;241m.\u001b[39m_passed_categorical \u001b[38;5;28;01mfor\u001b[39;00m ping \u001b[38;5;129;01min\u001b[39;00m grouper\u001b[38;5;241m.\u001b[39mgroupings):\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/pandas/core/groupby/grouper.py:1043\u001b[0m, in 
\u001b[0;36mget_grouper\u001b[0;34m(obj, key, axis, level, sort, observed, validate, dropna)\u001b[0m\n\u001b[1;32m 1041\u001b[0m in_axis, level, gpr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, gpr, \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1043\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(gpr)\n\u001b[1;32m 1044\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(gpr, Grouper) \u001b[38;5;129;01mand\u001b[39;00m gpr\u001b[38;5;241m.\u001b[39mkey \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1045\u001b[0m \u001b[38;5;66;03m# Add key to exclusions\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m exclusions\u001b[38;5;241m.\u001b[39madd(gpr\u001b[38;5;241m.\u001b[39mkey)\n", + "\u001b[0;31mKeyError\u001b[0m: 'A'" + ] + } + ], + "source": [ + "extent = dd.Aggregation('extent', chunk, agg, finalize=finalize)\n", + "ddf.groupby('A').agg(extent).compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "df = pd.DataFrame({\n", + " 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],\n", + " 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],\n", + " 'C': [1, 2, 3, 4, 5, 6, 7, 8],\n", + " 'D': [10, 20, 30, 40, 50, 60, 70, 80]\n", + "})\n", + "\n", + "# 根据 'A' 列进行分组\n", + "grouped = df.groupby('A')\n", + "print(grouped)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'client' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mclient\u001b[49m\u001b[38;5;241m.\u001b[39mshutdown()\n", + "\u001b[0;31mNameError\u001b[0m: name 'client' is not defined" + ] + } + ], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dispy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ch-dask-dataframe/shuffle.ipynb b/ch-dask-dataframe/shuffle.ipynb new file mode 100644 index 0000000..e7f9bc5 --- /dev/null +++ b/ch-dask-dataframe/shuffle.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(dask-dataframe-shuffle)=\n", + "# Shuffle\n", + "\n", + "在分布式场景下,`sort`,`merge`,`groupby` 有可能会在不同 Worker 之间交换数据,即 Shuffle。这些 pandas 算子在单机上实现起来比较简单,但是在大数据分布式计算场景,实现起来并不简单。\n", + "Dask 在 `2023.1` 版本之后提供了一种新的 Shuffle 方法,可以加速大部分计算任务。\n", + "\n", + "## `groupby`\n", + "\n", + "{numref}`dataframe-groupby` 展示了 `groupby` 
在单机上的操作流程,它主要有三个阶段:分组、聚合、输出。分布式场景下,不同的数据分布在不同的 Partition 下。\n", + "\n", + "```{figure} ../img/ch-dask-dataframe/groupby.svg\n", + "---\n", + "width: 600px\n", + "name: dataframe-groupby\n", + "---\n", + "DataFrame groupby 示意图\n", + "```\n", + "\n", + "* `groupby(indexed_columns).agg()` 和 `groupby(indexed_columns).apply(user_def_fn)` 性能最好。`indexed_columns` 指的是索引列 Key,`agg` 指的是 Dask DataFrame 提供的官方的 `sum`,`mean`,`nunique` 等聚合方法。因为 `indexed_columns` 是排过序的了,可以很快地对 `indexed_columns` 进行分组,Shuffle 数据量不大。\n", + "* `groupby(non_indexed_columns).agg()` 的数据交换量要更大一些,`agg` 是 Dask 官方提供的方法,做过一些优化。\n", + "* `groupby(non_indexed_columns).apply(user_def_fn)` 的成本最高。它既要对所有数据进行交换,又要执行用户自定义的函数,\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dispy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ch-dask/dask-distributed.ipynb b/ch-dask/dask-distributed.ipynb index 387a0c7..45f1830 100644 --- a/ch-dask/dask-distributed.ipynb +++ b/ch-dask/dask-distributed.ipynb @@ -467,7 +467,7 @@ "* 用户当前作业对计算资源要求很高,需要更多的资源来满足计算需求。\n", "* 用户当前作业所申请的计算资源闲置,这些资源可被其他用户使用。尤其是当用户进行交互式数据可视化,而非大规模计算时。\n", "\n", - "比如 `KubeCluster` 和 `SLURMCluster` 都提供了 `adapt()` 方法。下面的例子可以在 0 到 10 个 Worker 之间动态缩放。自动缩放根据 Dask Scheduler 上作业的负载来决定运行多少个 Dask Worker。Dask 收集一些信息,比如每个 Dask Worker 上已用内存和可用内存,并根据这些信息自适应地调整计算资源数量。\n", + "`KubeCluster` 和 `SLURMCluster` 都提供了 `adapt()` 方法。下面的例子可以在 0 到 10 个 Worker 之间动态缩放。自动缩放根据 Dask Scheduler 上作业的负载来决定运行多少个 Dask Worker。Dask 收集一些信息,比如每个 Dask Worker 上已用内存和可用内存,并根据这些信息自适应地调整计算资源数量。\n", "\n", "```python\n", "from dask_kubernetes import KubeCluster\n", @@ -486,7 +486,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "client.shutdown()" + ] } ], "metadata": { diff --git a/ch-dask/task-graph-partitioning.ipynb b/ch-dask/task-graph-partitioning.ipynb index bbf88ae..ff5085d 100644 --- a/ch-dask/task-graph-partitioning.ipynb +++ b/ch-dask/task-graph-partitioning.ipynb @@ -532,7 +532,7 @@ "\n", "我们需要关注 Task Stream 栏,应该避免大量的空白或大量的红色。空白表示 Dask Worker 上没有任何任务,红色表示 Dask Worker 之间进行大量的数据交换。\n", "\n", - "{numref}`dask-good-partitions` 和 {numref}`dask-too-many-partitions` 是一个对比,两张图使用代码相同({numref}`dask-read-write` 中的例子),但是使用的数据块大小不同。{numref}`dask-too-many-partitions` 中的数据块过小,Task Graph 过大,出现了大量的红色,时间没有用在计算上,而是浪费在数据交换等其他事情上。\n", + "{numref}`dask-good-partitions` 和 {numref}`dask-too-many-partitions` 是一个对比,两张图使用代码相同({numref}`dask-dataframe-read-write` 中的例子),但是使用的数据块大小不同。{numref}`dask-too-many-partitions` 中的数据块过小,Task Graph 过大,出现了大量的红色,时间没有用在计算上,而是浪费在数据交换等其他事情上。\n", "\n", "```{figure} ../img/ch-dask/good-partitions.png\n", "---\n", diff --git a/ch-data-science/machine-learning.ipynb b/ch-data-science/machine-learning.ipynb index f1ede8b..fed1068 100644 --- a/ch-data-science/machine-learning.ipynb +++ b/ch-data-science/machine-learning.ipynb @@ -1056,6 +1056,44 @@ "* 学习率,即刚才提到的 $\\alpha$,控制着每次更新参数的步长。\n", "* 网络结构:模型的层数、每层的神经元数量、激活函数的选择等。不同的网络结构对于不同的任务可能有不同的性能表现。\n", "\n", + "## 实现细节\n", + "\n", + "神经网络训练实现起来要关注以下三个步骤:\n", + "\n", + "* 一次前向传播\n", + "* 一次反向传播\n", + "* 一次更新模型权重\n", + "\n", + "{numref}`model-training-input-output` 整理了神经网络的第 i 
层进行训练时,以上三个步骤的输入和输出。\n", + "\n", + "```{figure} ../img/ch-data-science/model-training-input-output.svg\n", + "---\n", + "width: 800px\n", + "name: model-training-input-output\n", + "---\n", + "前向传播、反向传播和更新模型权重的输入和输出\n", + "```\n", + "\n", + "对于前向传播,输入有两部分:i-1 层输出 $\\boldsymbol{a^{[i-1]}}$ 和第 i 层的模型权重 $\\boldsymbol{W^{[i]}}$、$\\boldsymbol{b^{[i]}}$;输出又被称为激活(Activation)。\n", + "\n", + "对于反向传播,输入有三部分:i 层输出 $\\boldsymbol{a^{[i]}}$;第 i 层的模型权重 $\\boldsymbol{W^{[i]}}$、$\\boldsymbol{b^{[i]}}$;损失对 i 层输出的导数 $\\boldsymbol{\\boldsymbol{\\frac{\\partial L}{a^{[i]}}}}$。根据链式法则,可以求出损失对 i 层模型权重的导数 $\\boldsymbol{\\frac{\\partial L}{\\partial W^{[i]}\n", + "}}$、$\\boldsymbol{\\frac{\\partial L}{\\partial b^{[i]}\n", + "}}$,也就是梯度。\n", + "\n", + "得到梯度后,需要沿着梯度下降的方向更新模型权重。如果是最简单的梯度下降法,优化器直接在模型原有权重基础上做减法,不需要额外保存状态,比如:$\\boldsymbol{W^{[l]}} = \\boldsymbol{W^{[l]}}-\\alpha\\frac{\\partial L}{\\partial \\boldsymbol{W^{[l]}}}$\n", + "\n", + "复杂一点的优化器,比如 Adam, 在梯度下降时引入了动量的概念。动量是梯度的指数移动平均,需要维护一个梯度的移动平均矩阵,这个矩阵就是优化器的状态。因此,优化器状态、原来的模型权重和梯度共同作为输入,可以得到更新后的模型权重。至此才能完成一轮模型的训练。\n", + "\n", + "如果只考虑前向传播和反向传播,对于一个神经网络,其训练过程如 {numref}`model-training` 所示。{numref}`model-training` 演示了 3 层神经网络,前向过程用 FWD 表示,反向过程用 BWD 表示。\n", + "\n", + "```{figure} ../img/ch-data-science/model-training.svg\n", + "---\n", + "width: 800px\n", + "name: model-training\n", + "---\n", + "前向传播(图中用 FWD 表示)和反向传播(图中用 BWD 表示)\n", + "```\n", + "\n", "## 推理\n", "\n", "模型训练就是前向和反向传播,模型推理只需要前向传播,只不过输入层换成了需要预测的 $\\boldsymbol{x}$。" diff --git a/ch-mpi-large-model/data-parallel.md b/ch-mpi-large-model/data-parallel.md index 17ebe32..8f6fde2 100644 --- a/ch-mpi-large-model/data-parallel.md +++ b/ch-mpi-large-model/data-parallel.md @@ -1,6 +1,57 @@ -数据并行性是一种允许您通过在多个计算节点之间复制模型并在它们之间划分数据集来更快地训练模型。 +(data-parallel)= +# 数据并行 -首先对于一次顺序训练,其中我们有一个计算节点,该节点将我们的模型加载到内存中。在每次训练迭代期间,我们加载下一个小批量,并在缓存每层输出的同时对模型执行前向传递。然后,我们计算损失并运行向后传递,从而计算梯度。这个过程如下图所示,使用 MNIST 图像作为我们的示例输入数据。 -![[数据并行图1.png]] -在数据并行(DataParallel)中,需要再N台机器上赋值模型,我们分割我们的数据为N个块,并且让每台机器处理一个块。![[数据并行图2.png]]对于整个过程,首先在初始对于两个计算节点,需要将模型参数scatter到进程组中的其他进程,之后对于每个计算节点,一旦计算出梯度,都需要进行MPI.AllReduce,收集全部的信息来更新参数。 -![[数据并行图3.png]] \ No newline at end of file +数据并行是一种最常见的大模型并行方法,相对其他并行,数据并行最简单。如 {numref}`data-parallel-img` 所示,模型被拷贝到不同的 GPU 设备上,训练数据被切分为多份,每份分给不同的 GPU 进行训练。 + +```{figure} ../img/ch-mpi-large-model/data-parallel.svg +--- +width: 600px +name: data-parallel-img +--- +数据并行示意图 +``` + +## 非并行训练 + +{numref}`machine-learning-intro` 介绍了神经网络模型训练的过程。我们先从非并行的场景开始,这里使用 MNIST 手写数字识别案例来演示,如 {numref}`data-parallel-single` 所示,它包含了一次前向传播和一次反向传播。 + +```{figure} ../img/ch-mpi-large-model/data-parallel-single.svg +--- +width: 600px +name: data-parallel-single +--- +在单个 GPU 上进行神经网络训练 +``` + +## 数据并行 + +数据并行将数据集切分成多份,模型权重在不同 GPU 上拷贝一份。如 {numref}`data-parallel-distributed` 所示,有两块 GPU,在每块 GPU 上,有拷贝的模型权重和被切分的输入数据集;每块 GPU 上**独立**进行前向传播和反向传播,即前向传播计算每层的输出值,反向传播计算模型权重的梯度,两块 GPU 上的计算互不影响。 + + +```{figure} ../img/ch-mpi-large-model/data-parallel-distributed.svg +--- +width: 600px +name: data-parallel-distributed +--- +在两个 GPU 上进行神经网络训练 +``` + +至少在前向传播和反向传播阶段,还没有通信的开销。但到了更新模型权重部分,需要进行必要的同步,因为每块 GPU 上得到的梯度不一样,可以用求平均的方式求得梯度。这里只演示 $\boldsymbol{W}$,$\boldsymbol{\frac{\partial L}{\partial W}}^{i}$ 为单块 GPU 上的梯度,$\boldsymbol{{\frac{\partial L}{\partial W}}^{sync}}$ 为同步之后的平均值。 + +$$ +\boldsymbol{ + {\frac{\partial L}{\partial W}}^{sync} = \frac{1}{\# GPUs} \sum_{i=0}^{\# GPUs} {\frac{\partial L}{\partial W}}^{i} +} +$$ + +对这些分布在不同 GPU 上的梯度同步时,可以使用 MPI 提供的 `AllReduce` 原语。MPI 的 `AllReduce` 将每块 GPU 上分别计算得到的梯度收集起来,计算平均后,再将更新后的梯度重新分发给各块 GPU。 + +如 
{numref}`data-parallel-all-reduce` 所示,梯度同步阶段,MPI 的 `AllReduce` 原语将各 GPU 上的梯度进行同步。 + +```{figure} ../img/ch-mpi-large-model/data-parallel-all-reduce.svg +--- +width: 600px +name: data-parallel-all-reduce +--- +在进行模型权重更新时,要使用 MPI 的 `AllReduce` 原语,将各 GPU 上的梯度进行同步 +``` \ No newline at end of file diff --git a/ch-mpi-large-model/large-model.md b/ch-mpi-large-model/large-model.md new file mode 100644 index 0000000..08cd79a --- /dev/null +++ b/ch-mpi-large-model/large-model.md @@ -0,0 +1,9 @@ +# 大模型 + +本章主要解释大模型的并行方法。大模型指的是神经网络的参数量很大,必须并行地进行训练和推理。大模型并行有如下特点: + +* 计算运行在 GPU 这样的加速卡上; +* 加速卡非常昂贵,应尽量提高加速卡的利用率; +* 模型参数量大,无论是训练还是推理,可能有大量数据需要在加速卡之间传输,对带宽和延迟的要求都很高。 + +本章主要从概念和原理上进行解读,具体的实现可参考其他论文和开源库。 \ No newline at end of file diff --git a/ch-mpi-large-model/nccl.md b/ch-mpi-large-model/nccl.md index 442e3eb..2a107bb 100644 --- a/ch-mpi-large-model/nccl.md +++ b/ch-mpi-large-model/nccl.md @@ -22,9 +22,9 @@ NCCL 实现了常见的通信原语 同样,其他加速器厂商也提出了自己的通信库,比如: -* AMD 提供了针对 ROCm 编程范式的 RCCL(ROCm Communication Collectives Library) +* AMD 提供了针对 ROCm 的 RCCL(ROCm Communication Collectives Library) * 华为提供了 HCCL(Huawei Collective Communication Library) 这些集合通信库都是针对特定硬件的通信库,旨在解决特定集群的通信问题。 -NCCL 主要提供了 C/C++ 编程接口,Python 社区如果使用的话,可以考虑 PyTorch 的 `torch.distributed`。NCCL 也是 PyTorch 推荐的 GPU 并行计算后端。本书不再细致讲解 `torch.distributed` 的使用,而是继续用 mpi4py 来演示大模型训练和推理过程中涉及的各类通信问题。 \ No newline at end of file +NCCL 主要提供了 C/C++ 编程接口,Python 社区如果使用的话,可以考虑 PyTorch 的 `torch.distributed`。NCCL 也是 PyTorch 推荐的 GPU 并行计算后端。本书不再细致讲解 `torch.distributed` 的使用,而是继续用 MPI 来演示大模型训练和推理过程中涉及的各类通信问题。 \ No newline at end of file diff --git a/ch-mpi-large-model/neural-network-training.md b/ch-mpi-large-model/neural-network-training.md new file mode 100644 index 0000000..fced574 --- /dev/null +++ b/ch-mpi-large-model/neural-network-training.md @@ -0,0 +1,7 @@ +(neural-network-training)= +# 模型训练 + +## 模型训练的过程 + +在深入分析大模型训练前,我们先回顾一下模型训练。 + diff --git a/ch-mpi-large-model/pipeline-parallel.md b/ch-mpi-large-model/pipeline-parallel.md index c499564..d4c6850 100644 --- a/ch-mpi-large-model/pipeline-parallel.md +++ b/ch-mpi-large-model/pipeline-parallel.md @@ -1,14 +1,54 @@ -naive pipeline是实现流水线并行训练的最直接的方法,我们将模型分为多个部分,每个部分分配一个GPU,然后进行训练,在分割模型的边界处插入通信步骤。 -我们以这个4层顺序模型为例: -Output=L4(L3(L2(L1(Input)))) +(pipeline-parallel)= +# 流水线并行 -如果把这个模型的计算分配给两个GPU,例如: -- GPU1:intermediate = L2(L1(input)) -- GPU2:output = L4(L3(intermediate)) +流水线并行是另外一种常见的大模型并行方法。数据并行将模型权重在每个 GPU 上拷贝一份,如果模型大小没有超过单块 GPU 显存大小,数据并行是最简单易用的选项。但现在的模型大到已经无法放在单块 GPU 上,比如 175B 的 GPT-3,如果用 FP16 存储,也需要 350GB 存储空间,而单块 NVIDIA A100 和 H100 为 80GB。流水线并行可以解决这个问题,它将大模型的不同层切分到不同的 GPU 上。其核心思想如 {numref}`pipeline-parallel-img` 所示。 -所以,为了完成前向传递,我们把GPU1计算出来的结果张量 intermediate 传输到GPU2,对于后向传递,我们把梯度 intermediate 从GPU2发送到GPU1。这样,模型并行训练会保持与单节点相同的输出和梯度。 +```{figure} ../img/ch-mpi-large-model/pipeline-parallel.svg +--- +width: 600px +name: pipeline-parallel-img +--- +朴素流水线并行示意图 +``` -如图,仅需要使用节点到节点通信(MPI.Send 和 MPI.Recv),并且不需要任何集体通信原语。 -![[pipeline 并行图1.png]] +## 朴素流水线并行 -数据并行和流水线并行一同作用的示意图:![[pipeline 并行图2.png]] \ No newline at end of file +假如模型被切分为两部分,第一部分在 GPU 0 上,第二部分在 GPU 1 上。每个 GPU 上计算前向传播和反向传播,如 {numref}`pipeline-parallel-distributed` 所示。 + +* 前向传播过程中,输出需要在 GPU 之间传输。 +* 反向传播过程中,损失对于输出的梯度需要在 GPU 之间传输。 + +```{figure} ../img/ch-mpi-large-model/pipeline-parallel-distributed.svg +--- +width: 600px +name: pipeline-parallel-distributed +--- +在两个 GPU 上使用流水线并行 +``` + +在这种最朴素的流水线并行场景,只需要使用点对点通信:`MPI.Send` 和 `MPI.Recv`,不需要集合通信。 + +朴素流水线并行有一个致命的缺点,那就是**GPU 利用率低**。主要体现在: + +* 任何一个时刻只有一块 GPU 在进行计算,其他 GPU 都在等待上下游的计算结果传过来。如果不考虑通信的时间成本,GPU 利用率仅为 
$\frac{1}{\# GPUs}\%$。 +* 如果考虑通信的时间成本,GPU 在等待网卡的数据传输过来,GPU 计算和 GPU 之间通信没有重叠(Overlap)。GPU 设备和网络设备是相互独立的,GPU 进行当前批次计算的同时,本可以让网络设备传输上一批次的数据,两者本可以同时工作。 + +针对这些问题,研究者提出了一些方法,从数据切分和重叠的角度优化流水线并行,以提高 GPU 利用率。这些方法在朴素流水线并行基础上进行改进,感兴趣的读者可以阅读以下原文,这里不再赘述。 + +* GPipe {cite}`huang2019GPipe`。 +* PipeDream {cite}`narayanan2019PipeDream`。 +* Megatron-LM {cite}`narayanan2021Efficient`。 + +## 流水线并行 + 数据并行 + +流水线并行与数据并行是相互正交的,两者可以结合起来同时使用。由于两种并行是正交的,互不干扰,为避免数据传输错乱,应使用 MPI 的 Communicator 来做隔离。在 {numref}`mpi-hello-world` 中我们曾经提到,Communicator 可以被理解为 MPI 中的组,同一个 GPU 可以在不同的 Communicator 中。如 {numref}`pipeline-parallel-data-parallel` 所示,我们创建了两类 Communicator:红色为流水线并行的 Communicator,蓝色为数据并行的 Communicator。同一个 GPU 既属于红色,也属于蓝色:既要实现流水线并行中模型层之间的通信,也要实现数据并行的梯度的同步。 + +```{figure} ../img/ch-mpi-large-model/pipeline-parallel-data-parallel.svg +--- +width: 600px +name: pipeline-parallel-data-parallel +--- +流水线并行结合数据并行 +``` + +至此,我们介绍了两种最朴素的大模型并行训练方式:数据并行和流水线并行。工业级分布式训练库的实现比这些复杂,但背后的思想万变不离其宗。 \ No newline at end of file diff --git a/ch-ray-data/data-transform.ipynb b/ch-ray-data/data-transform.ipynb index edb1872..2babc27 100644 --- a/ch-ray-data/data-transform.ipynb +++ b/ch-ray-data/data-transform.ipynb @@ -45,23 +45,21 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "2023-12-15 12:08:18,544\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", - "2023-12-15 12:08:21,451\tINFO worker.py:1673 -- Started a local Ray instance.\n" + "2024-02-15 19:12:00,048\tINFO worker.py:1724 -- Started a local Ray instance.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "文件夹 /Users/luweizheng/Projects/py-101/distributed-python/ch-ray-data/../data/nyc-taxi 已存在,无需操作。\n" + "文件夹 /Users/luweizheng/Projects/godaai/distributed-python/ch-ray-data/../data/nyc-taxi 已存在,无需操作。\n" ] } ], "source": [ "import os\n", "import shutil\n", + "import warnings\n", "import urllib.request\n", "from typing import Any, Dict\n", "\n", @@ -70,6 +68,8 @@ "import torch\n", "import ray\n", "\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", "if ray.is_initialized:\n", " ray.shutdown()\n", "\n", @@ -103,20 +103,52 @@ "execution_count": 2, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "128b9acfa02540eca47e5a1f91c66b4f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Parquet Files Sample 0: 0%| | 0/1 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n", - "2023-12-15 12:08:26,713\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:26,714\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38102)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by 
mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38102)\u001b[0m return transform_pyarrow.concat(tables) \n", - " \r" + "2024-02-15 19:12:01,506\tINFO dataset.py:2488 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2024-02-15 19:12:01,509\tINFO set_read_parallelism.py:115 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", + "2024-02-15 19:12:01,509\tINFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 200, each read task output is split into 50 smaller blocks.\n", + "2024-02-15 19:12:01,510\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n", + "2024-02-15 19:12:01,510\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:01,510\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c3e92b7a33ea4421b9e828dacf491111", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00SplitBlocks(50) pid=5966)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: FutureWarning: promote has been superseded by mode='default'.\n", + "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5966)\u001b[0m return transform_pyarrow.concat(tables)\n" ] }, { @@ -180,16 +212,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-15 12:08:28,216\tINFO split_read_output_blocks.py:101 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", - "2023-12-15 12:08:28,219\tINFO split_read_output_blocks.py:106 -- To satisfy the requested parallelism of 200, each read task output is split into 200 smaller blocks.\n", - "2023-12-15 12:08:28,221\tINFO streaming_executor.py:104 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Map(transform_row)] -> LimitOperator[limit=1]\n", - "2023-12-15 12:08:28,222\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:28,224\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38108)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38108)\u001b[0m return transform_pyarrow.concat(tables) \n", - " \r" + "2024-02-15 19:12:02,068\tINFO set_read_parallelism.py:115 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", + 
"2024-02-15 19:12:02,069\tINFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 200, each read task output is split into 50 smaller blocks.\n", + "2024-02-15 19:12:02,069\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Map(transform_row)] -> LimitOperator[limit=1]\n", + "2024-02-15 19:12:02,069\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:02,070\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28fbc3a6677f4b1b809b0662a977f691", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=10]\n", - "2023-12-15 12:08:35,842\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:35,843\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38103)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38103)\u001b[0m return transform_pyarrow.concat(tables) \n", - " \r" + "2024-02-15 19:12:04,789\tINFO set_read_parallelism.py:115 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", + "2024-02-15 19:12:04,790\tINFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 200, each read task output is split into 50 smaller blocks.\n", + "2024-02-15 19:12:04,790\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=10]\n", + "2024-02-15 19:12:04,790\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:04,790\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3af6e0ef7f5643a2b57cc3fcb0292e1c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00SplitBlocks(50) pid=5960)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: 
FutureWarning: promote has been superseded by mode='default'.\n", + "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5960)\u001b[0m return transform_pyarrow.concat(tables)\n" ] }, { @@ -289,16 +351,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-15 12:08:37,202\tINFO split_read_output_blocks.py:101 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", - "2023-12-15 12:08:37,203\tINFO split_read_output_blocks.py:106 -- To satisfy the requested parallelism of 200, each read task output is split into 200 smaller blocks.\n", - "2023-12-15 12:08:37,205\tINFO streaming_executor.py:104 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches()]\n", - "2023-12-15 12:08:37,207\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:37,208\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38106)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38106)\u001b[0m return transform_pyarrow.concat(tables) \n", - " \r" + "2024-02-15 19:12:05,225\tINFO set_read_parallelism.py:115 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", + "2024-02-15 19:12:05,225\tINFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 200, each read task output is split into 50 smaller blocks.\n", + "2024-02-15 19:12:05,225\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches()]\n", + "2024-02-15 19:12:05,225\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:05,226\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b77916ceb4b74390badf1461ce5fca01", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=100] -> ActorPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=3]\n", - "2023-12-15 12:08:44,230\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:44,230\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run 
`ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "2023-12-15 12:08:44,276\tINFO actor_pool_map_operator.py:114 -- MapBatches(TorchPredictor): Waiting for 2 pool actors to start...\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38103)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(200) pid=38103)\u001b[0m return transform_pyarrow.concat(tables) \n", - "2023-12-15 12:08:49,380\tWARNING actor_pool_map_operator.py:271 -- To ensure full parallelization across an actor pool of size 2, the Dataset should consist of at least 2 distinct blocks. Consider increasing the parallelism when creating the Dataset.\n" + "2024-02-15 19:12:06,821\tWARNING util.py:546 -- The argument ``compute`` is deprecated in Ray 2.9. Please specify argument ``concurrency`` instead. For more information, see https://docs.ray.io/en/master/data/transforming-data.html#stateful-transforms.\n", + "2024-02-15 19:12:06,829\tINFO set_read_parallelism.py:115 -- Using autodetected parallelism=200 for stage ReadParquet to satisfy DataContext.get_current().min_parallelism=200.\n", + "2024-02-15 19:12:06,830\tINFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 200, each read task output is split into 50 smaller blocks.\n", + "2024-02-15 19:12:06,830\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=100] -> ActorPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=3]\n", + "2024-02-15 19:12:06,831\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:06,832\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2024-02-15 19:12:06,846\tINFO actor_pool_map_operator.py:114 -- MapBatches(TorchPredictor): Waiting for 2 pool actors to start...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "95bb2562908a46ec97ef831ae5ad6072", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running 0: 0%| | 0/1 [00:00SplitBlocks(50) pid=5964)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: FutureWarning: promote has been superseded by mode='default'.\n", + "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5964)\u001b[0m return transform_pyarrow.concat(tables)\n", + "2024-02-15 19:12:08,283\tWARNING actor_pool_map_operator.py:278 -- To ensure full parallelization across an actor pool of size 2, the Dataset should consist of at least 2 distinct blocks. 
Consider increasing the parallelism when creating the Dataset.\n" ] }, { @@ -398,29 +492,81 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-15 12:08:49,415\tINFO streaming_executor.py:104 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=20]\n", - "2023-12-15 12:08:49,415\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 12:08:49,417\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\n", - "\u001b[A\n", - "\u001b[A\n", - "\n", - "\u001b[A\u001b[A\n", - "\n", - "Sort Sample 0: 0%| | 0/4 [00:00 AllToAllOperator[Aggregate] -> LimitOperator[limit=20]\n", + "2024-02-15 19:12:08,333\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "2024-02-15 19:12:08,333\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - " return transform_pyarrow.concat(tables)\n", - " \n", - "\u001b[A\n", - "\n", - "\u001b[A\u001b[A" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "54684a32e65142cda779ee0745f7b35e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- Aggregate 1: 0%| | 0/4 [00:00 + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-data-science/model-training-input-output.drawio b/drawio/ch-data-science/model-training-input-output.drawio new file mode 100644 index 0000000..e5f945b --- /dev/null +++ b/drawio/ch-data-science/model-training-input-output.drawio @@ -0,0 +1,172 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-data-science/model-training.drawio b/drawio/ch-data-science/model-training.drawio new file mode 100644 index 0000000..c9d1618 --- /dev/null +++ b/drawio/ch-data-science/model-training.drawio @@ -0,0 +1,298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff 
--git a/drawio/ch-mpi-large-model/data-parallel-all-reduce.drawio b/drawio/ch-mpi-large-model/data-parallel-all-reduce.drawio new file mode 100644 index 0000000..2367a6f --- /dev/null +++ b/drawio/ch-mpi-large-model/data-parallel-all-reduce.drawio @@ -0,0 +1,330 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/data-parallel-distributed.drawio b/drawio/ch-mpi-large-model/data-parallel-distributed.drawio new file mode 100644 index 0000000..6a8904a --- /dev/null +++ b/drawio/ch-mpi-large-model/data-parallel-distributed.drawio @@ -0,0 +1,328 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/data-parallel-single.drawio b/drawio/ch-mpi-large-model/data-parallel-single.drawio new file mode 100644 index 0000000..2741c7f --- /dev/null +++ b/drawio/ch-mpi-large-model/data-parallel-single.drawio @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/data-parallel.drawio b/drawio/ch-mpi-large-model/data-parallel.drawio new file mode 100644 index 0000000..8d88f2b --- /dev/null +++ b/drawio/ch-mpi-large-model/data-parallel.drawio @@ -0,0 +1,100 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/pipeline-parallel-data-parallel.drawio b/drawio/ch-mpi-large-model/pipeline-parallel-data-parallel.drawio new file mode 100644 index 0000000..3289139 --- /dev/null +++ b/drawio/ch-mpi-large-model/pipeline-parallel-data-parallel.drawio @@ -0,0 +1,294 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/pipeline-parallel-distributed.drawio b/drawio/ch-mpi-large-model/pipeline-parallel-distributed.drawio new file mode 100644 index 0000000..6c0f67d --- /dev/null +++ b/drawio/ch-mpi-large-model/pipeline-parallel-distributed.drawio @@ -0,0 +1,129 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/pipeline-parallel.drawio b/drawio/ch-mpi-large-model/pipeline-parallel.drawio new file mode 100644 index 0000000..35bf846 --- /dev/null +++ b/drawio/ch-mpi-large-model/pipeline-parallel.drawio @@ -0,0 +1,100 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/ch-dask-dataframe/dataframe-model.svg b/img/ch-dask-dataframe/dataframe-model.svg new file mode 100644 index 0000000..6f737a9 --- /dev/null +++ b/img/ch-dask-dataframe/dataframe-model.svg @@ -0,0 +1,4 @@ + + + +
[dataframe-model.svg text labels: 数据 (data), 行标签 (row labels), 列标签 (column labels)]
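The figure names the three parts of a DataFrame: the data block, the row labels (index), and the column labels. A minimal pandas sketch of the same anatomy; the column names and values below are made up for illustration and are not taken from the book's dataset:

```python
import pandas as pd

# Toy DataFrame: the dict supplies the data block and column labels,
# `index` supplies the row labels.
df = pd.DataFrame(
    {"Size": [2, 4, 0], "DiffMeanHourlyPercent": [0.3, 0.3, 0.0]},
    index=["a", "b", "c"],
)

print(df.columns)     # column labels
print(df.index)       # row labels (index)
print(df.to_numpy())  # underlying data block
```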
\ No newline at end of file diff --git a/img/ch-dask-dataframe/divisions.png b/img/ch-dask-dataframe/divisions.png new file mode 100644 index 0000000..ef58e59 Binary files /dev/null and b/img/ch-dask-dataframe/divisions.png differ diff --git a/img/ch-dask-dataframe/groupby.svg b/img/ch-dask-dataframe/groupby.svg new file mode 100644 index 0000000..b683ab9 --- /dev/null +++ b/img/ch-dask-dataframe/groupby.svg @@ -0,0 +1,1616 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/img/ch-dask-dataframe/nyc-flights-graph.svg b/img/ch-dask-dataframe/nyc-flights-graph.svg index 59de5c4..b9d0a86 100644 --- a/img/ch-dask-dataframe/nyc-flights-graph.svg +++ b/img/ch-dask-dataframe/nyc-flights-graph.svg @@ -8,255 +8,255 @@ viewBox="0.00 0.00 1058.21 436.28" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> - + --4992946418407614279 +-7150382809904974857 read-csv - + --2791454735931807732 +-6194689680917011400 0 - + --4992946418407614279->-2791454735931807732 +-7150382809904974857->-6194689680917011400 - + --1856934773411932769 +6071941830101125281 to_pyarrow_string - + --2791454735931807732->-1856934773411932769 +-6194689680917011400->6071941830101125281 - + --66498125748364619 +-65660599333282282 read-csv - + -3716186020170190693 +312951075184987025 1 - + --66498125748364619->3716186020170190693 +-65660599333282282->312951075184987025 - + -5227787437159759806 +-8135483071169533727 to_pyarrow_string - + -3716186020170190693->5227787437159759806 +312951075184987025->-8135483071169533727 - + -4172821046690527989 +-1102500011605602925 read-csv - + --1176888008802505673 +-8978480336772077469 2 - + -4172821046690527989->-1176888008802505673 +-1102500011605602925->-8978480336772077469 - + -5068749226614284286 +-1050760860597841152 to_pyarrow_string - + --1176888008802505673->5068749226614284286 +-8978480336772077469->-1050760860597841152 - + --7189200816447331052 +-1906626916754175967 read-csv - + -5330752747299492752 +-2470839580670079044 3 - + --7189200816447331052->5330752747299492752 +-1906626916754175967->-2470839580670079044 - + -3386821119738866121 +4071937302134756076 to_pyarrow_string - + -5330752747299492752->3386821119738866121 
+-2470839580670079044->4071937302134756076 - + -5177257767402434271 +5178095293817516608 read-csv - + -7440036120417123049 +4036801175431919381 4 - + -5177257767402434271->7440036120417123049 +5178095293817516608->4036801175431919381 - + -3425514041675701871 +8508987607055959954 to_pyarrow_string - + -7440036120417123049->3425514041675701871 +4036801175431919381->8508987607055959954 - + --9030167133868224737 +-9029329607453142400 read-csv - + -2546962091444426683 +-856272853540776985 5 - + --9030167133868224737->2546962091444426683 +-9029329607453142400->-856272853540776985 - + -7664833214114594479 +-2363636268343853305 to_pyarrow_string - + -2546962091444426683->7664833214114594479 +-856272853540776985->-2363636268343853305 - + --810036887397515834 +8774990811659256194 0 - + --1856934773411932769->-810036887397515834 +6071941830101125281->8774990811659256194 - + -5697603868704482591 +3881916782686559828 1 - + -5227787437159759806->5697603868704482591 +-8135483071169533727->3881916782686559828 - + -804529839731786225 +-8057186534920993363 2 - + -5068749226614284286->804529839731786225 +-1050760860597841152->-8057186534920993363 - + -2913813212849416522 +1098126126831493759 3 - + -3386821119738866121->2913813212849416522 +4071937302134756076->1098126126831493759 - + --9025290104758136669 +7605766882933492184 4 - + -3425514041675701871->-9025290104758136669 +8508987607055959954->7605766882933492184 - + -4528379939978718581 +-4333336434674061007 5 - + -7664833214114594479->4528379939978718581 +-2363636268343853305->-4333336434674061007 diff --git a/img/ch-data-science/model-training-input-output.svg b/img/ch-data-science/model-training-input-output.svg new file mode 100644 index 0000000..05a2b07 --- /dev/null +++ b/img/ch-data-science/model-training-input-output.svg @@ -0,0 +1,4 @@ + + + +
+[Figure text: for layer i, forward propagation takes the layer i-1 output and the layer i model weights and produces the layer i output; backward propagation takes the layer i output, the gradient of the loss with respect to the layer i output, and the layer i model weights, and produces the gradient of the loss with respect to the layer i model weights; the weight update takes the layer i optimizer state, the layer i model weights, and that gradient, and produces the updated layer i model weights. The embedded LaTeX labels are truncated in this export.]
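In symbols, the per-layer data flow this figure describes can be sketched as follows (a minimal sketch only: f_i is the layer-i transformation, W_i its weights, a_i its output, and plain SGD with learning rate \eta stands in for the update step; the optimizer state shown in the figure would correspond to a stateful optimizer such as Adam):

\[
a_i = f_i(a_{i-1}; W_i), \qquad
\frac{\partial L}{\partial W_i} = \frac{\partial L}{\partial a_i}\,\frac{\partial a_i}{\partial W_i}, \qquad
W_i \leftarrow W_i - \eta\,\frac{\partial L}{\partial W_i}.
\]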
diff --git a/img/ch-data-science/model-training.svg b/img/ch-data-science/model-training.svg
new file mode 100644
index 0000000..4dc4d74
--- /dev/null
+++ b/img/ch-data-science/model-training.svg
@@ -0,0 +1,4 @@
+[Figure text: a three-layer model; forward passes FWD 1 -> FWD 2 -> FWD 3 turn the input into the predicted values, which are compared with the labels to compute the loss L; backward passes BWD 3 -> BWD 2 -> BWD 1 propagate gradients back; each layer keeps its own weights and optimizer state. The embedded LaTeX labels are truncated in this export.]
diff --git a/img/ch-mpi-large-model/data-parallel-all-reduce.svg b/img/ch-mpi-large-model/data-parallel-all-reduce.svg
new file mode 100644
index 0000000..f561054
--- /dev/null
+++ b/img/ch-mpi-large-model/data-parallel-all-reduce.svg
@@ -0,0 +1,4 @@
+[Figure text: two model replicas train on different data shards; GPU 0 takes inputs 4, 1, 8 (labels 4, 1, 8) and predicts 4, 7, 8, while GPU 1 takes inputs 7, 6, 2 (labels 7, 6, 2) and predicts 1, 6, 2; each replica runs FWD 1-3 and BWD 3-1 and computes its own loss L, and the resulting gradients are synchronized across the replicas with MPI.AllReduce. The embedded LaTeX labels are truncated in this export.]
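The gradient synchronization this figure depicts can be sketched with mpi4py and NumPy. This is a minimal illustration rather than the book's implementation: `local_gradients`, `weights`, and `learning_rate` are stand-ins, and each MPI rank plays one GPU replica (run with e.g. `mpirun -np 2 python ...`).

# Minimal sketch of data-parallel gradient averaging with MPI.AllReduce.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# Each replica holds the same weights but sees a different data shard,
# so its locally computed gradients differ (random values as a stand-in
# for the gradients produced by the backward pass).
rng = np.random.default_rng(seed=rank)
local_gradients = rng.standard_normal(4)

# Sum the gradients across all replicas, then divide by the number of
# replicas to obtain the averaged gradient every replica will apply.
global_gradients = np.empty_like(local_gradients)
comm.Allreduce(local_gradients, global_gradients, op=MPI.SUM)
global_gradients /= size

# Every replica applies the same update, keeping the weight copies in sync.
learning_rate = 0.1
weights = np.zeros(4)  # stand-in for the replicated model weights
weights -= learning_rate * global_gradients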
diff --git a/img/ch-mpi-large-model/data-parallel-distributed.svg b/img/ch-mpi-large-model/data-parallel-distributed.svg
new file mode 100644
index 0000000..db46b4c
--- /dev/null
+++ b/img/ch-mpi-large-model/data-parallel-distributed.svg
@@ -0,0 +1,4 @@
+[Figure text: the model weights are copied onto multiple GPUs and the training data is split; GPU 0 trains on the shard with inputs and labels 4, 1, 8 (predicting 4, 7, 8) and GPU 1 on the shard with inputs and labels 7, 6, 2 (predicting 1, 6, 2), each running its own FWD 1-3, loss L, and BWD 3-1. The embedded LaTeX labels are truncated in this export.]
diff --git a/img/ch-mpi-large-model/data-parallel-single.svg b/img/ch-mpi-large-model/data-parallel-single.svg
new file mode 100644
index 0000000..0f8f9b6
--- /dev/null
+++ b/img/ch-mpi-large-model/data-parallel-single.svg
@@ -0,0 +1,4 @@
+[Figure text: single-device baseline; one copy of the model runs FWD 1-3 and BWD 3-1 over the whole batch (inputs and labels 4, 1, 8, 7, 6, 2, predictions 4, 7, 8, 1, 6, 2) and computes a single loss L. The embedded LaTeX labels are truncated in this export.]
diff --git a/img/ch-mpi-large-model/data-parallel.svg b/img/ch-mpi-large-model/data-parallel.svg
new file mode 100644
index 0000000..8a096fa
--- /dev/null
+++ b/img/ch-mpi-large-model/data-parallel.svg
@@ -0,0 +1,4 @@
+[Figure text: the training data is partitioned across four GPUs, each holding a full model replica (the source labels them GPU 0, GPU 1, GPU 2, GPU 2; the last is presumably meant to be GPU 3).]
diff --git a/img/ch-mpi-large-model/pipeline-parallel-data-parallel.svg b/img/ch-mpi-large-model/pipeline-parallel-data-parallel.svg
new file mode 100644
index 0000000..e4a8901
--- /dev/null
+++ b/img/ch-mpi-large-model/pipeline-parallel-data-parallel.svg
@@ -0,0 +1,4 @@
+[Figure text: hybrid pipeline and data parallelism over GPU 0-3; inside each pipeline (GPU 0 -> GPU 1 and GPU 2 -> GPU 3) the first stage runs FWD 1 and MPI.Send-s its activations, the second stage MPI.Recv-s them, runs FWD 2, compares the predictions with the labels to get the loss L, runs BWD 2, and sends gradients back for BWD 1; across the two pipelines the weight gradients are averaged with MPI.AllReduce inside a data-parallel communicator, while each pipeline communicates through its own pipeline-parallel communicator. The embedded LaTeX labels are truncated in this export.]
diff --git a/img/ch-mpi-large-model/pipeline-parallel-distributed.svg b/img/ch-mpi-large-model/pipeline-parallel-distributed.svg
new file mode 100644
index 0000000..8c317d9
--- /dev/null
+++ b/img/ch-mpi-large-model/pipeline-parallel-distributed.svg
@@ -0,0 +1,4 @@
+[Figure text: a two-stage pipeline; GPU 0 runs FWD 1 on the input and MPI.Send-s the activations to GPU 1, which MPI.Recv-s them, runs FWD 2, compares the predicted values with the labels to compute the loss L, runs BWD 2, and sends the boundary gradient back to GPU 0 for BWD 1. The embedded LaTeX labels are truncated in this export.]
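The Send/Recv handoff between the two stages can be sketched with mpi4py and NumPy; this is a minimal sketch under simplifying assumptions, not the book's code: the two "layers" are plain matrix multiplies and all names (`w1`, `w2`, `grad_a1`, ...) are illustrative. Rank 0 plays GPU 0 (FWD 1 / BWD 1), rank 1 plays GPU 1 (FWD 2, loss, BWD 2); run with e.g. `mpirun -np 2 python ...`.

# Minimal sketch of a two-stage pipeline with point-to-point MPI messages.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

if rank == 0:
    x = np.array([4.0, 1.0, 8.0])        # input shard (digits from the figure)
    w1 = np.ones((3, 3))                  # stand-in weights for stage 1
    a1 = x @ w1                           # FWD 1: stage-1 activations
    comm.Send(a1, dest=1, tag=0)          # hand the activations to stage 2
    grad_a1 = np.empty_like(a1)
    comm.Recv(grad_a1, source=1, tag=1)   # receive dL/da1 from stage 2
    grad_w1 = np.outer(x, grad_a1)        # BWD 1: gradient for the stage-1 weights
elif rank == 1:
    a1 = np.empty(3)
    comm.Recv(a1, source=0, tag=0)        # receive activations from stage 1
    w2 = np.ones((3, 3))                  # stand-in weights for stage 2
    y_pred = a1 @ w2                      # FWD 2: predictions
    y_true = np.array([4.0, 1.0, 8.0])    # labels
    loss = 0.5 * np.sum((y_pred - y_true) ** 2)
    grad_y = y_pred - y_true              # BWD 2: dL/dy_pred
    grad_a1 = grad_y @ w2.T               # gradient at the stage boundary
    comm.Send(grad_a1, dest=0, tag=1)     # send it back so stage 1 can run BWD 1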
diff --git a/img/ch-mpi-large-model/pipeline-parallel.svg b/img/ch-mpi-large-model/pipeline-parallel.svg
new file mode 100644
index 0000000..03d030d
--- /dev/null
+++ b/img/ch-mpi-large-model/pipeline-parallel.svg
@@ -0,0 +1,4 @@
+[Figure text: model layers 0-3 are placed on GPU 0-3 respectively, and activations flow from one GPU to the next during forward propagation.]
diff --git a/references.bib b/references.bib
index b04dbf5..3dd7cd4 100644
--- a/references.bib
+++ b/references.bib
@@ -69,4 +69,46 @@ @inproceedings{hewitt1973Universal
 year = {1973},
 pages = {235--245},
 publisher = {{William Kaufmann}}
-}
\ No newline at end of file
+}
+
+@inproceedings{huang2019GPipe,
+author = {Huang, Yanping and Cheng, Youlong and Bapna, Ankur and Firat, Orhan and Chen, Mia Xu and Chen, Dehao and Lee, HyoukJoong and Ngiam, Jiquan and Le, Quoc V. and Wu, Yonghui and Chen, Zhifeng},
+title = {{GPipe}: Efficient Training of Giant Neural Networks Using Pipeline Parallelism},
+year = {2019},
+publisher = {Curran Associates Inc.},
+address = {Red Hook, NY, USA},
+booktitle = {Proceedings of the 33rd International Conference on Neural Information Processing Systems},
+articleno = {10},
+numpages = {10}
+}
+
+@inproceedings{narayanan2019PipeDream,
+author = {Narayanan, Deepak and Harlap, Aaron and Phanishayee, Amar and Seshadri, Vivek and Devanur, Nikhil R. and Ganger, Gregory R. and Gibbons, Phillip B. and Zaharia, Matei},
+title = {{PipeDream}: Generalized Pipeline Parallelism for {DNN} Training},
+year = {2019},
+isbn = {9781450368735},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3341301.3359646},
+doi = {10.1145/3341301.3359646},
+booktitle = {Proceedings of the 27th ACM Symposium on Operating Systems Principles},
+pages = {1--15},
+numpages = {15},
+location = {Huntsville, Ontario, Canada},
+series = {SOSP '19}
+}
+
+@inproceedings{narayanan2021Efficient,
+author = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and LeGresley, Patrick and Patwary, Mostofa and Korthikanti, Vijay and Vainbrand, Dmitri and Kashinkunti, Prethvi and Bernauer, Julie and Catanzaro, Bryan and Phanishayee, Amar and Zaharia, Matei},
+title = {Efficient Large-Scale Language Model Training on {GPU} Clusters Using {Megatron-LM}},
+year = {2021},
+month = nov,
+booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
+pages = {1--15},
+publisher = {ACM},
+address = {St. Louis, Missouri},
+doi = {10.1145/3458817.3476209},
+urldate = {2023-11-13},
+isbn = {978-1-4503-8442-1},
+langid = {english}
+}