diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 35b6ad6..ced8a1b 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -34,4 +34,4 @@ jobs: run: ssh-keyscan -p ${{ secrets.REMOTE_PORT }} -H ${{ secrets.REMOTE_HOST }} >> ~/.ssh/known_hosts - name: Deploy with rsync - run: rsync -avz _build/html/* ${{ secrets.REMOTE_USER }}@${{ secrets.REMOTE_HOST }}:/var/www/distributed-python/ + run: rsync -avz _build/html/* ${{ secrets.REMOTE_USER }}@${{ secrets.REMOTE_HOST }}:/var/www/scale-python/ diff --git a/README.md b/README.md index 33b2120..0f77b8d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Python 分布式编程 +# Python 数据科学加速 -开源的、面向下一代人工智能应用的 Python 分布式编程书籍。 +开源的、面向下一代数据科学和人工智能应用的 Python 并行加速编程书籍。 ## 内容介绍 diff --git a/ch-dask-dataframe/indexing.ipynb b/ch-dask-dataframe/indexing.ipynb index 51eed82..8206544 100644 --- a/ch-dask-dataframe/indexing.ipynb +++ b/ch-dask-dataframe/indexing.ipynb @@ -16,15 +16,27 @@ "hide-cell" ] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 51899 instead\n", + " warnings.warn(\n" + ] + } + ], "source": [ "%config InlineBackend.figure_format = 'svg'\n", "import os\n", - "import urllib\n", - "import shutil\n", - "from zipfile import ZipFile\n", + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_flights\n", "\n", "import dask\n", + "dask.config.set({'dataframe.query-planning': False})\n", "import dask.dataframe as dd\n", "import pandas as pd\n", "from dask.distributed import LocalCluster, Client\n", @@ -52,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -123,7 +135,7 @@ "3 qux three 4 40" ] }, - 
"execution_count": 15, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -147,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -220,7 +232,7 @@ "qux three 4 40" ] }, - "execution_count": 19, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -239,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -310,7 +322,7 @@ "3 qux three 4 40" ] }, - "execution_count": 20, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -343,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -370,7 +382,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -397,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -433,28 +445,28 @@ " \n", " \n", " 2018-01-01 00:00:00\n", - " 984\n", - " 0.660595\n", + " 992\n", + " -0.711756\n", " \n", " \n", " 2018-01-01 00:00:01\n", - " 960\n", - " -0.747564\n", + " 1018\n", + " -0.838596\n", " \n", " \n", " 2018-01-01 00:00:02\n", - " 1039\n", - " 0.777117\n", + " 1000\n", + " -0.735968\n", " \n", " \n", " 2018-01-01 00:00:03\n", - " 1038\n", - " -0.501949\n", + " 1004\n", + " 0.904384\n", " \n", " \n", " 2018-01-01 00:00:04\n", - " 992\n", - " 0.767979\n", + " 1021\n", + " 0.025423\n", " \n", " \n", " ...\n", @@ -463,28 +475,28 @@ " \n", " \n", " 2022-12-31 23:59:55\n", - " 1005\n", - " -0.102774\n", + " 1020\n", + " 0.961542\n", " \n", " \n", " 2022-12-31 23:59:56\n", - " 1040\n", - " -0.648857\n", + " 963\n", + " -0.663948\n", " \n", " \n", " 2022-12-31 23:59:57\n", - " 1019\n", - " -0.310174\n", + " 1010\n", + " 0.510401\n", " \n", " \n", " 2022-12-31 23:59:58\n", - " 987\n", - " 0.889037\n", + " 964\n", + " -0.882126\n", " \n", " \n", " 
2022-12-31 23:59:59\n", - " 977\n", - " -0.078216\n", + " 1020\n", + " -0.532950\n", " \n", " \n", "\n", @@ -494,22 +506,22 @@ "text/plain": [ " id x\n", "timestamp \n", - "2018-01-01 00:00:00 984 0.660595\n", - "2018-01-01 00:00:01 960 -0.747564\n", - "2018-01-01 00:00:02 1039 0.777117\n", - "2018-01-01 00:00:03 1038 -0.501949\n", - "2018-01-01 00:00:04 992 0.767979\n", + "2018-01-01 00:00:00 992 -0.711756\n", + "2018-01-01 00:00:01 1018 -0.838596\n", + "2018-01-01 00:00:02 1000 -0.735968\n", + "2018-01-01 00:00:03 1004 0.904384\n", + "2018-01-01 00:00:04 1021 0.025423\n", "... ... ...\n", - "2022-12-31 23:59:55 1005 -0.102774\n", - "2022-12-31 23:59:56 1040 -0.648857\n", - "2022-12-31 23:59:57 1019 -0.310174\n", - "2022-12-31 23:59:58 987 0.889037\n", - "2022-12-31 23:59:59 977 -0.078216\n", + "2022-12-31 23:59:55 1020 0.961542\n", + "2022-12-31 23:59:56 963 -0.663948\n", + "2022-12-31 23:59:57 1010 0.510401\n", + "2022-12-31 23:59:58 964 -0.882126\n", + "2022-12-31 23:59:59 1020 -0.532950\n", "\n", "[157766400 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -527,31 +539,31 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py:640: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " head = reader(BytesIO(b_sample), nrows=sample_rows, **head_kwargs)\n" + ] + }, { "data": { "text/plain": [ "(None, None, None, None, None, None, None)" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", - "download_url = \"https://dp.godaai.org/nyc-flights.zip\"\n", - "zip_file_path = os.path.join(folder_path, \"nyc-flights.zip\")\n", - "if not os.path.exists(os.path.join(folder_path, \"nyc-flights\")):\n", - " with urllib.request.urlopen(download_url) as response, open(zip_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " zf = ZipFile(zip_file_path, 'r')\n", - " zf.extractall(folder_path)\n", - " zf.close()\n", - "file_path = os.path.join(folder_path, \"nyc-flights\", \"*.csv\")\n", + "folder_path = nyc_flights()\n", + "file_path = os.path.join(folder_path, \"*.csv\")\n", "flights_ddf = dd.read_csv(file_path,\n", " parse_dates={'Date': [0, 1, 2]},\n", " dtype={'TailNum': object,\n", @@ -569,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -602,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -633,16 +645,38 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-23 16:05:06,483 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle 008ee90768895dabe7a3e94389222068 initialized by task ('shuffle-transfer-008ee90768895dabe7a3e94389222068', 0) executed on worker tcp://127.0.0.1:51911\n", + "2024-04-23 16:05:06,505 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle 008ee90768895dabe7a3e94389222068 deactivated due to stimulus 
'task-finished-1713859506.50483'\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ " col2\n", "col1 \n", - "01 a\n", + "01 a\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-23 16:05:06,545 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle 01fddf4f11082a43a6075f7888029dd3 initialized by task ('shuffle-transfer-01fddf4f11082a43a6075f7888029dd3', 1) executed on worker tcp://127.0.0.1:51912\n", + "2024-04-23 16:05:06,604 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle 01fddf4f11082a43a6075f7888029dd3 deactivated due to stimulus 'task-finished-1713859506.6028118'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ " col2\n", "col1 \n", "02 c\n", @@ -675,17 +709,25 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-23 16:05:16,522 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle d162433f4ca23d129354be4d414ea589 initialized by task ('shuffle-transfer-d162433f4ca23d129354be4d414ea589', 999) executed on worker tcp://127.0.0.1:51914\n", + "2024-04-23 16:05:27,101 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle d162433f4ca23d129354be4d414ea589 deactivated due to stimulus 'task-finished-1713859527.100699'\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "before set_index npartitions: 1826\n", - "after set_index npartitions: 163\n", - "CPU times: user 6.1 s, sys: 3.47 s, total: 9.57 s\n", - "Wall time: 19.6 s\n" + "after set_index npartitions: 165\n", + "CPU times: user 6.63 s, sys: 3.65 s, total: 10.3 s\n", + "Wall time: 20.6 s\n" ] } ], @@ -706,7 +748,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -716,15 +758,23 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-23 16:05:38,056 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle d162433f4ca23d129354be4d414ea589 initialized by task ('shuffle-transfer-d162433f4ca23d129354be4d414ea589', 999) executed on worker tcp://127.0.0.1:51914\n", + "2024-04-23 16:05:49,629 - distributed.shuffle._scheduler_plugin - WARNING - Shuffle d162433f4ca23d129354be4d414ea589 deactivated due to stimulus 'task-finished-1713859549.629161'\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.25 s, sys: 1.09 s, total: 4.34 s\n", - "Wall time: 11.7 s\n" + "CPU times: user 3.24 s, sys: 1.7 s, total: 4.94 s\n", + "Wall time: 11.9 s\n" ] } ], @@ -743,15 +793,15 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.94 s, sys: 743 ms, total: 2.68 s\n", - "Wall time: 8.18 s\n" + "CPU times: user 1.88 s, sys: 1.09 s, total: 2.97 s\n", + "Wall time: 8.38 s\n" ] } ], @@ -782,9 +832,17 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/4n/v40br47s46ggrjm9bdm64lwh0000gn/T/ipykernel_76150/639704942.py:3: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " pdf = pd.read_csv(file_path,\n" + ] + }, { "data": { "text/html": [ @@ -837,14 +895,14 @@ "1 JFK 9.311532" ] }, - "execution_count": 13, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas\n", - "file_path = os.path.join(folder_path, \"nyc-flights\", \"1991.csv\")\n", + "file_path = os.path.join(folder_path, \"1991.csv\")\n", "pdf = pd.read_csv(file_path,\n", " parse_dates={'Date': [0, 1, 2]},\n", " dtype={'TailNum': object,\n", @@ -865,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -920,7 +978,7 @@ "1 JFK 9.311532" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -940,9 +998,23 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "data": { "text/html": [ @@ -995,7 +1067,7 @@ "1 JFK 10.766914" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1017,6 +1089,13 @@ "source": [ "client.shutdown()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/ch-dask-dataframe/map-partitions.ipynb b/ch-dask-dataframe/map-partitions.ipynb index e077d3f..e3357cb 100644 --- a/ch-dask-dataframe/map-partitions.ipynb +++ b/ch-dask-dataframe/map-partitions.ipynb @@ -4,57 +4,37 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "(sec-dask-map_partitions)=\n", "# `map_partitions`\n", "\n", - "除了 {numref}`sec-dask-dataframe-shuffle` 中提到的一些需要通信的计算外,有一种最简单的并行方式,英文术语为 Embarrassingly Parallel,中文可翻译为易并行。它指的是该类计算不需要太多跨 Worker 的协调和通信。比如,对某个字段加一,每个 Worker 内执行加法操作即可,Worker 之间没有通信的开销。Dask DataFrame 中可以使用 `map_partitions()` 来做这类 Embarrassingly Parallel 的操作。`map_partitions(func)` 的参数是一个 `func`,这个 `func` 将在每个 Partition 上执行。\n", + "除了 {numref}`sec-dask-dataframe-shuffle` 中提到的一些需要通信的计算外,有一种最简单的并行方式,英文术语为 Embarrassingly Parallel,中文可翻译为**易并行**。它指的是该类计算不需要太多跨 Worker 的协调和通信。比如,对某个字段加一,每个 Worker 内执行加法操作即可,Worker 之间没有通信的开销。Dask DataFrame 中可以使用 `map_partitions(func)` 来做这类 Embarrassingly Parallel 的操作。`map_partitions(func)` 的参数是一个 `func`,这个 `func` 将在每个 pandas DataFrame 上执行,`func` 内可以使用 pandas DataFrame 的各类操作。如 {numref}`fig-dask-map-partitions` 所示,`map_partitions(func)` 对原来的 pandas DataFrame 进行了转换操作。\n", "\n", - "下面的案例对缺失值进行填充,它没有跨 Worker 的通信开销,因此是一种 Embarrassingly 
Parallel 的典型应用场景。" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import urllib\n", - "import shutil\n", - "from zipfile import ZipFile\n", - "import warnings\n", + "```{figure} ../img/ch-dask-dataframe/map-partitions.svg\n", + "---\n", + "width: 600px\n", + "name: fig-dask-map-partitions\n", + "---\n", + "map_partitions()\n", + "```\n", "\n", - "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "## 案例:纽约出租车数据\n", "\n", - "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", - "download_url_prefix = \"https://gender-pay-gap.service.gov.uk/viewing/download-data/\"\n", - "file_path_prefix = os.path.join(folder_path, \"gender-pay\")\n", - "if not os.path.exists(file_path_prefix):\n", - " os.makedirs(file_path_prefix)\n", - "for year in [2017, 2018, 2019, 2020, 2021, 2022]:\n", - " download_url = download_url_prefix + str(year)\n", - " file_path = os.path.join(file_path_prefix, f\"{str(year)}.csv\")\n", - " if not os.path.exists(file_path):\n", - " with urllib.request.urlopen(download_url) as response, open(file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)" + "我们使用纽约出租车数据集进行简单的数据预处理:计算每个订单的时长。原数据集中,`tpep_pickup_datetime` 和 `tpep_dropoff_datetime` 分别为乘客上车和下车时间,现在只需要将下车时间 `tpep_dropoff_datetime` 减去上车时间 `tpep_pickup_datetime`。这个计算没有跨 Worker 的通信开销,因此是一种 Embarrassingly Parallel 的典型应用场景。" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", - "Perhaps you already have a cluster running?\n", - "Hosting the HTTP server on port 57481 instead\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_taxi\n", 
+ "\n", + "import pandas as pd\n", + "import dask\n", + "dask.config.set({'dataframe.query-planning': False})\n", "import dask.dataframe as dd\n", "import pandas as pd\n", "from dask.distributed import LocalCluster, Client\n", @@ -65,31 +45,249 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "ddf = dd.read_csv(os.path.join(file_path_prefix, \"*.csv\"),\n", - " dtype={'CompanyNumber': 'str', 'DiffMeanHourlyPercent': 'float64'})\n", + "dataset_path = nyc_taxi()\n", + "ddf = dd.read_parquet(dataset_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VendorIDtrip_durationtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
0112532023-06-01 00:08:482023-06-01 00:29:411.03.401.0N140238121.93.500.56.700.01.033.602.50.00
116142023-06-01 00:15:042023-06-01 00:25:180.03.401.0N50151115.63.500.53.000.01.023.602.50.00
2111232023-06-01 00:48:242023-06-01 01:07:071.010.201.0N13897140.87.750.510.000.01.060.050.01.75
3214062023-06-01 00:54:032023-06-01 01:17:293.09.831.0N100244139.41.000.58.880.01.053.282.50.00
425142023-06-01 00:18:442023-06-01 00:27:181.01.171.0N13723419.31.000.50.720.01.015.022.50.00
\n", + "
" + ], + "text/plain": [ + " VendorID trip_duration tpep_pickup_datetime tpep_dropoff_datetime \\\n", + "0 1 1253 2023-06-01 00:08:48 2023-06-01 00:29:41 \n", + "1 1 614 2023-06-01 00:15:04 2023-06-01 00:25:18 \n", + "2 1 1123 2023-06-01 00:48:24 2023-06-01 01:07:07 \n", + "3 2 1406 2023-06-01 00:54:03 2023-06-01 01:17:29 \n", + "4 2 514 2023-06-01 00:18:44 2023-06-01 00:27:18 \n", + "\n", + " passenger_count trip_distance RatecodeID store_and_fwd_flag \\\n", + "0 1.0 3.40 1.0 N \n", + "1 0.0 3.40 1.0 N \n", + "2 1.0 10.20 1.0 N \n", + "3 3.0 9.83 1.0 N \n", + "4 1.0 1.17 1.0 N \n", + "\n", + " PULocationID DOLocationID payment_type fare_amount extra mta_tax \\\n", + "0 140 238 1 21.9 3.50 0.5 \n", + "1 50 151 1 15.6 3.50 0.5 \n", + "2 138 97 1 40.8 7.75 0.5 \n", + "3 100 244 1 39.4 1.00 0.5 \n", + "4 137 234 1 9.3 1.00 0.5 \n", + "\n", + " tip_amount tolls_amount improvement_surcharge total_amount \\\n", + "0 6.70 0.0 1.0 33.60 \n", + "1 3.00 0.0 1.0 23.60 \n", + "2 10.00 0.0 1.0 60.05 \n", + "3 8.88 0.0 1.0 53.28 \n", + "4 0.72 0.0 1.0 15.02 \n", + "\n", + " congestion_surcharge Airport_fee \n", + "0 2.5 0.00 \n", + "1 2.5 0.00 \n", + "2 0.0 1.75 \n", + "3 2.5 0.00 \n", + "4 2.5 0.00 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def transform(df):\n", + " df[\"trip_duration\"] = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.seconds\n", + " # 将 `trip_duration` 挪到前面\n", + " dur_column = df.pop('trip_duration')\n", + " df.insert(1, dur_column.name, dur_column)\n", + " return df\n", "\n", - "def fillna(df):\n", - " return df.fillna(value={\"PostCode\": \"UNKNOWN\"})\n", - " \n", - "ddf = ddf.map_partitions(fillna)" + "ddf = ddf.map_partitions(transform)\n", + "ddf.compute()\n", + "ddf.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Dask DataFrame 模拟了 pandas DataFrame,如果这个 API 的计算模式是 Embarrassingly Parallel,它的底层很可能就是使用 `map_partitions()` 实现的。\n", + "Dask 
DataFrame 的某些 API 的计算模式是 Embarrassingly Parallel,它的底层很可能就是使用 `map_partitions()` 实现的。\n", "\n", - "{numref}`sec-dask-dataframe-indexing` 提到过,Dask DataFrame 会在某个列上进行切分。我们可以在 `map_partitions()` 的 `func` 中实现任何我们想做的事情,但如果对这些切分的列做了改动,需要 `clear_divisions()` 或者重新 `set_index()`。" + "{numref}`sec-dask-dataframe-indexing` 提到过,Dask DataFrame 会在某个列上进行切分,但如果对这些切分的列做了改动,需要 `clear_divisions()` 或者重新 `set_index()`。" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -114,43 +312,29 @@ " \n", " \n", " \n", - " EmployerName\n", - " EmployerId\n", - " Address\n", - " PostCode\n", - " CompanyNumber\n", - " SicCodes\n", - " DiffMeanHourlyPercent\n", - " DiffMedianHourlyPercent\n", - " DiffMeanBonusPercent\n", - " DiffMedianBonusPercent\n", - " MaleBonusPercent\n", - " FemaleBonusPercent\n", - " MaleLowerQuartile\n", - " FemaleLowerQuartile\n", - " MaleLowerMiddleQuartile\n", - " FemaleLowerMiddleQuartile\n", - " MaleUpperMiddleQuartile\n", - " FemaleUpperMiddleQuartile\n", - " MaleTopQuartile\n", - " FemaleTopQuartile\n", - " CompanyLinkToGPGInfo\n", - " ResponsiblePerson\n", - " EmployerSize\n", - " CurrentName\n", - " SubmittedAfterTheDeadline\n", - " DueDate\n", - " DateSubmitted\n", + " VendorID\n", + " trip_duration\n", + " tpep_pickup_datetime\n", + " tpep_dropoff_datetime\n", + " passenger_count\n", + " trip_distance\n", + " RatecodeID\n", + " store_and_fwd_flag\n", + " PULocationID\n", + " DOLocationID\n", + " payment_type\n", + " fare_amount\n", + " extra\n", + " mta_tax\n", + " tip_amount\n", + " tolls_amount\n", + " improvement_surcharge\n", + " total_amount\n", + " congestion_surcharge\n", + " Airport_fee\n", " \n", " \n", - " npartitions=6\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " npartitions=1\n", " \n", " \n", " \n", @@ -176,17 +360,17 @@ " \n", " \n", " \n", - " string\n", + " int32\n", + " int32\n", + " datetime64[us]\n", + " datetime64[us]\n", " int64\n", - " string\n", - " 
string\n", - " string\n", - " string\n", - " float64\n", - " float64\n", - " float64\n", - " float64\n", " float64\n", + " int64\n", + " string\n", + " int32\n", + " int32\n", + " int64\n", " float64\n", " float64\n", " float64\n", @@ -196,73 +380,6 @@ " float64\n", " float64\n", " float64\n", - " string\n", - " string\n", - " string\n", - " string\n", - " bool\n", - " string\n", - " string\n", - " \n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", " \n", @@ -286,62 +403,22 @@ " ...\n", " ...\n", " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", " \n", " \n", "\n", "\n", - "
Dask Name: fillna, 3 graph layers
" + "
Dask Name: transform, 2 graph layers
" ], "text/plain": [ "Dask DataFrame Structure:\n", - " EmployerName EmployerId Address PostCode CompanyNumber SicCodes DiffMeanHourlyPercent DiffMedianHourlyPercent DiffMeanBonusPercent DiffMedianBonusPercent MaleBonusPercent FemaleBonusPercent MaleLowerQuartile FemaleLowerQuartile MaleLowerMiddleQuartile FemaleLowerMiddleQuartile MaleUpperMiddleQuartile FemaleUpperMiddleQuartile MaleTopQuartile FemaleTopQuartile CompanyLinkToGPGInfo ResponsiblePerson EmployerSize CurrentName SubmittedAfterTheDeadline DueDate DateSubmitted\n", - "npartitions=6 \n", - " string int64 string string string string float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 float64 string string string string bool string string\n", - " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "Dask Name: fillna, 3 graph layers" + " VendorID trip_duration tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount congestion_surcharge Airport_fee\n", + "npartitions=1 \n", + " int32 int32 datetime64[us] datetime64[us] int64 float64 int64 string int32 int32 int64 float64 float64 float64 float64 float64 float64 float64 float64 float64\n", + " ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "Dask Name: transform, 2 graph layers" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -352,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ diff --git a/ch-dask-dataframe/read-write.ipynb b/ch-dask-dataframe/read-write.ipynb index 3b1f5fe..a9ba5cd 100644 --- a/ch-dask-dataframe/read-write.ipynb +++ b/ch-dask-dataframe/read-write.ipynb @@ -40,63 +40,40 @@ "\n", "## 数据切分与并行读取\n", "\n", - "我们以读取逗号分隔的数值(Comma-Separated Values,CSV)文件为例,来展示 Dask DataFrame 与 pandas 的区别。" + "### 案例:飞机起降数据\n", + "\n", + "飞机起降数据集由多个逗号分隔的数值(Comma-Separated Values,CSV)文件组成,每个文件对应一个年份,我们以读取多个 CSV 为例,来展示 Dask DataFrame 与 pandas 的区别。" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/luweizheng/Projects/godaai/distributed-python/ch-dask-dataframe/../data/nyc-flights/*.csv\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import glob\n", - "import urllib\n", - "import shutil\n", - "from zipfile import ZipFile\n", - "import warnings\n", "\n", + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_flights\n", + "\n", + "import warnings\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", - "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", - "download_url = \"https://dp.godaai.org/nyc-flights.zip\"\n", - "zip_file_path = os.path.join(folder_path, \"nyc-flights.zip\")\n", - "if not os.path.exists(os.path.join(folder_path, \"nyc-flights\")):\n", - " with urllib.request.urlopen(download_url) as response, open(zip_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " zf = ZipFile(zip_file_path, 'r')\n", - " zf.extractall(folder_path)\n", - " zf.close()\n", + "folder_path = nyc_flights()\n", "\n", - "file_path = 
os.path.join(folder_path, \"nyc-flights\", \"*.csv\")\n", - "print(file_path)" + "file_path = os.path.join(folder_path, \"*.csv\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", - "Perhaps you already have a cluster running?\n", - "Hosting the HTTP server on port 55077 instead\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ + "import dask\n", + "dask.config.set({'dataframe.query-planning': False})\n", "import dask.dataframe as dd\n", "import pandas as pd\n", "from dask.distributed import LocalCluster, Client\n", @@ -114,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -130,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -153,9 +130,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "data": { "text/html": [ @@ -302,7 +287,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 4, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -313,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -457,7 +442,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 5, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -477,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -486,255 +471,255 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", - "-7150382809904974857\n", + "3521720512806484412\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "-6194689680917011400\n", + "6736009212905862099\n", "\n", "0\n", "\n", - "\n", + "\n", "\n", - "-7150382809904974857->-6194689680917011400\n", + "3521720512806484412->6736009212905862099\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "6071941830101125281\n", + "-1563468653830855738\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-6194689680917011400->6071941830101125281\n", + "6736009212905862099->-1563468653830855738\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-65660599333282282\n", + "2484881100534163769\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "312951075184987025\n", + "-2555422199051202395\n", "\n", "1\n", "\n", - "\n", + "\n", "\n", - "-65660599333282282->312951075184987025\n", + "2484881100534163769->-2555422199051202395\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-8135483071169533727\n", + "2675850518608036870\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "312951075184987025->-8135483071169533727\n", + "-2555422199051202395->2675850518608036870\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-1102500011605602925\n", + "1680754195385590727\n", "\n", "read-csv\n", 
"\n", - "\n", + "\n", "\n", - "-8978480336772077469\n", + "3952218557050796030\n", "\n", "2\n", "\n", - "\n", + "\n", "\n", - "-1102500011605602925->-8978480336772077469\n", + "1680754195385590727->3952218557050796030\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-1050760860597841152\n", + "4484414144468516194\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-8978480336772077469->-1050760860597841152\n", + "3952218557050796030->4484414144468516194\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-1906626916754175967\n", + "1719447117322426477\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "-2470839580670079044\n", + "-7986884760556757161\n", "\n", "3\n", "\n", - "\n", + "\n", "\n", - "-1906626916754175967->-2470839580670079044\n", + "1719447117322426477->-7986884760556757161\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "4071937302134756076\n", + "8723733316907408802\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-2470839580670079044->4071937302134756076\n", + "-7986884760556757161->8723733316907408802\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "5178095293817516608\n", + "5958766289761319085\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "4036801175431919381\n", + "5566785284180098089\n", "\n", "4\n", "\n", - "\n", + "\n", "\n", - "5178095293817516608->4036801175431919381\n", + "5958766289761319085->5566785284180098089\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "8508987607055959954\n", + "7919606411758835760\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "4036801175431919381->8508987607055959954\n", + "5566785284180098089->7919606411758835760\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-9029329607453142400\n", + "-121519200098467208\n", "\n", "read-csv\n", "\n", - "\n", + "\n", "\n", - "-856272853540776985\n", + "7676068657297728386\n", "\n", "5\n", "\n", - "\n", + "\n", "\n", - "-9029329607453142400->-856272853540776985\n", + "-121519200098467208->7676068657297728386\n", "\n", "\n", "\n", - 
"\n", + "\n", "\n", - "-2363636268343853305\n", + "2288159746428799421\n", "\n", "to_pyarrow_string\n", "\n", - "\n", + "\n", "\n", - "-856272853540776985->-2363636268343853305\n", + "7676068657297728386->2288159746428799421\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "8774990811659256194\n", + "-5288713516117402287\n", "\n", "0\n", "\n", - "\n", + "\n", "\n", - "6071941830101125281->8774990811659256194\n", + "-1563468653830855738->-5288713516117402287\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "3881916782686559828\n", + "8264956528619452963\n", "\n", "1\n", "\n", - "\n", + "\n", "\n", - "-8135483071169533727->3881916782686559828\n", + "2675850518608036870->8264956528619452963\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-8057186534920993363\n", + "-8072504171972468356\n", "\n", "2\n", "\n", - "\n", + "\n", "\n", - "-1050760860597841152->-8057186534920993363\n", + "4484414144468516194->-8072504171972468356\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "1098126126831493759\n", + "5481165872764386894\n", "\n", "3\n", "\n", - "\n", + "\n", "\n", - "4071937302134756076->1098126126831493759\n", + "8723733316907408802->5481165872764386894\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "7605766882933492184\n", + "-6457937444843166297\n", "\n", "4\n", "\n", - "\n", + "\n", "\n", - "8508987607055959954->7605766882933492184\n", + "7919606411758835760->-6457937444843166297\n", "\n", "\n", "\n", - "\n", + "\n", "\n", - "-4333336434674061007\n", + "49703311258832128\n", "\n", "5\n", "\n", - "\n", + "\n", "\n", - "-2363636268343853305->-4333336434674061007\n", + "2288159746428799421->49703311258832128\n", "\n", "\n", "\n", @@ -745,7 +730,7 @@ "" ] }, - "execution_count": 6, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -779,9 +764,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -813,10 +806,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-02-20 13:26:56,131 - distributed.worker - WARNING - Compute Failed\n", - "Key: ('to_pyarrow_string-be40c5ef4347788c673803e766c9f5ac', 5)\n", + "2024-04-23 15:57:08,658 - distributed.worker - WARNING - Compute Failed\n", + "Key: ('to_pyarrow_string-1ab0cc1658d1dd5cfa26f384700ac39c', 5)\n", "Function: execute_task\n", - "args: ((subgraph_callable-104274d7-85ea-4720-954d-78db2282e8c8, [(, , 0, 24979433, b'\\n'), None, True, True]))\n", + "args: ((subgraph_callable-490521c8fd346c8bafcb4403419b222a, [(, , 0, 24979433, b'\\n'), None, True, True]))\n", "kwargs: {}\n", "Exception: 'ValueError(\\'Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.\\\\n\\\\n+----------------+---------+----------+\\\\n| Column | Found | Expected |\\\\n+----------------+---------+----------+\\\\n| CRSElapsedTime | float64 | int64 |\\\\n| TailNum | object | float64 |\\\\n+----------------+---------+----------+\\\\n\\\\nThe following columns also raised exceptions on conversion:\\\\n\\\\n- TailNum\\\\n ValueError(\"could not convert string to float: \\\\\\'N14346\\\\\\'\")\\\\n\\\\nUsually this is due to dask\\\\\\'s dtype inference failing, and\\\\n*may* be fixed by specifying dtypes manually by adding:\\\\n\\\\ndtype={\\\\\\'CRSElapsedTime\\\\\\': \\\\\\'float64\\\\\\',\\\\n \\\\\\'TailNum\\\\\\': \\\\\\'object\\\\\\'}\\\\n\\\\nto the call to `read_csv`/`read_table`.\\')'\n", "\n" @@ -841,9 +834,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "data": { "text/html": [ @@ -990,7 +991,7 @@ "[3 rows x 21 columns]" ] }, - "execution_count": 8, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1013,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1074,7 +1075,7 @@ "[0 rows x 21 columns]" ] }, - "execution_count": 9, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1092,9 +1093,515 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 24, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1104,17 +1611,41 @@ "JFK 255485\n", "LGA 591305\n", "Name: Origin, dtype: int64\n", - "CPU times: user 1.67 s, sys: 328 ms, total: 2 s\n", - "Wall time: 5.49 s\n" + "CPU times: user 288 ms, sys: 63.2 ms, total: 351 ms\n", + "Wall time: 1.44 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" ] } ], "source": [ "%%time\n", - "ddf = dd.read_csv(file_path, \n", - " parse_dates={\"Date\": [0, 1, 2]},\n", - " dtype={\"TailNum\": str, \"CRSElapsedTime\": float, \"Cancelled\": bool},\n", - " blocksize=50_000)\n", + "ddf = dd.read_csv(file_path,\n", + " parse_dates={'Date': [0, 1, 2]},\n", + " dtype={'TailNum': object,\n", + " 'CRSElapsedTime': float,\n", + " 'Cancelled': bool},\n", + " blocksize=500_000)\n", "\n", "origin_cnt = ddf[~ddf.Cancelled].groupby(\"Origin\").Origin.count().compute()\n", "print(origin_cnt)" @@ -1122,9 +1653,65 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. 
Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n", + "/Users/luweizheng/miniconda3/envs/dask/lib/python3.11/site-packages/dask/dataframe/io/csv.py:195: FutureWarning: Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated. Combine the desired columns with pd.to_datetime after parsing instead.\n", + " df = reader(bio, **kwargs)\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1134,8 +1721,8 @@ "JFK 255485\n", "LGA 591305\n", "Name: Origin, dtype: int64\n", - "CPU times: user 51.4 ms, sys: 11.1 ms, total: 62.5 ms\n", - "Wall time: 539 ms\n" + "CPU times: user 65.5 ms, sys: 17.8 ms, total: 83.4 ms\n", + "Wall time: 1.05 s\n" ] } ], @@ -1144,7 +1731,8 @@ "ddf = dd.read_csv(file_path, \n", " parse_dates={\"Date\": [0, 1, 2]},\n", " dtype={\"TailNum\": str, \"CRSElapsedTime\": float, \"Cancelled\": bool},\n", - " blocksize=5_000_000)\n", + " blocksize=5_000_000\n", + " )\n", "\n", "origin_cnt = ddf[~ddf.Cancelled].groupby(\"Origin\").Origin.count().compute()\n", "print(origin_cnt)" @@ -1223,7 +1811,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": { "tags": [ "hide-cell" @@ -1258,7 +1846,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.11.8" } }, "nbformat": 4, diff --git 
a/ch-data-science/index.md b/ch-data-science/index.md index 7ddbb49..188b9b9 100644 --- a/ch-data-science/index.md +++ b/ch-data-science/index.md @@ -1,6 +1,6 @@ # 数据科学 -Python 分布式编程主要服务于数据科学和人工智能等领域,本章主要介绍数据科学相关背景知识,如果相关基础较好,也可以直接跳过本章。 +本章主要介绍数据科学相关背景知识,如果相关基础较好,也可以直接跳过本章。 ```{tableofcontents} ``` \ No newline at end of file diff --git a/ch-mpi-large-model/data-parallel.md b/ch-mpi-large-model/data-parallel.md index 8cff42d..65e9904 100644 --- a/ch-mpi-large-model/data-parallel.md +++ b/ch-mpi-large-model/data-parallel.md @@ -5,7 +5,7 @@ ```{figure} ../img/ch-mpi-large-model/data-parallel.svg --- -width: 600px +width: 500px name: fig-data-parallel-img --- 数据并行示意图 diff --git a/ch-ray-data/data-load-inspect-save.ipynb b/ch-ray-data/data-load-inspect-save.ipynb index 39b9421..3eaf6b7 100644 --- a/ch-ray-data/data-load-inspect-save.ipynb +++ b/ch-ray-data/data-load-inspect-save.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": { "tags": [ "hide-cell" @@ -23,11 +23,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-13 21:10:50,705\tINFO worker.py:1673 -- Started a local Ray instance.\n" + "2024-04-23 15:46:00,903\tINFO worker.py:1740 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8267 \u001b[39m\u001b[22m\n" ] }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c1d146493b2841bf93612d26a92c2490", + "version_major": 2, + "version_minor": 0 + }, "text/html": [ "
\n", "
\n", @@ -48,23 +53,27 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + "\n", + "\n", "
Python version:3.11.53.11.7
Ray version:2.8.02.11.0
Dashboard:http://127.0.0.1:8267
\n", "\n", "
\n", "
\n" ], "text/plain": [ - "RayContext(dashboard_url='', python_version='3.11.5', ray_version='2.8.0', ray_commit='105355bd253d6538ed34d331f6a4bdf0e38ace3a', protocol_version=None)" + "RayContext(dashboard_url='127.0.0.1:8267', python_version='3.11.7', ray_version='2.11.0', ray_commit='2eb4a8119f903f79b78c01eaa9db06c6c390051c')" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -72,9 +81,12 @@ "source": [ "import os\n", "import shutil\n", - "import urllib.request\n", "from pathlib import Path\n", "\n", + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_taxi\n", + "\n", "import ray\n", "\n", "if ray.is_initialized:\n", @@ -91,12 +103,12 @@ "\n", "Ray Data 提供了很多预置的数据加载方法,包括读取文件、读取 pandas DataFrame 这种内存数据、读取数据库中的数据。这里我们以一个纽约出租车司机的例子来演示读取 Parquet 文件。\n", "\n", - "首先下载该数据,下面的代码会判断是否下载到目标文件夹,如果已经下载,则无需重复下载。" + "首先下载该数据,并使用 `ray.data` 提供的 [`read_parquet()`](https://docs.ray.io/en/latest/data/api/doc/ray.data.read_parquet.html) 方法读取数据,得到一个 `Dataset`。" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": { "tags": [ "hide-output" @@ -104,52 +116,102 @@ }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "文件夹 /Users/luweizheng/Projects/py-101/distributed-python/ch-ray-data/../data/nyc-taxi 已存在,无需操作。\n" - ] - } - ], - "source": [ - "folder_path = os.path.join(os.getcwd(), \"../data/nyc-taxi\")\n", - "download_url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet\"\n", - "file_name = download_url.split(\"/\")[-1]\n", - "parquet_file_path = os.path.join(folder_path, file_name)\n", - "if not os.path.exists(folder_path):\n", - " # 创建文件夹\n", - " os.makedirs(folder_path)\n", - " print(f\"文件夹 {folder_path} 不存在,已创建。\")\n", - " # 下载并保存 Parquet 文件\n", - " with urllib.request.urlopen(download_url) as response, open(parquet_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " 
print(\"数据已下载并保存为 Parquet 文件。\")\n", - "else:\n", - " print(f\"文件夹 {folder_path} 已存在,无需操作。\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "使用 `ray.data` 提供的 [`read_parquet()`](https://docs.ray.io/en/latest/data/api/doc/ray.data.read_parquet.html) 方法读取数据,得到一个 `Dataset`。" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "661d0924a50a4963810566b722a7ad6d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Parquet Files Sample 0: 0%| | 0/2 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n" ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7bd391923ff1435bb3f0e03a5136aa0d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(5) 1: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=2]\n", - "2023-12-13 21:10:57,881\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-13 21:10:57,882\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(173) pid=42048)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(173) pid=42048)\u001b[0m return transform_pyarrow.concat(tables) \n", - " " + "2024-04-23 15:46:04,082\tINFO streaming_executor.py:112 -- Starting execution of Dataset. 
Full logs are in /tmp/ray/session_2024-04-23_15-45-59_228244_75431/logs\n", + "2024-04-23 15:46:04,083\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=2]\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'passenger_count': 1, 'tip_amount': 6.7, 'payment_type': 1}\n", - "{'passenger_count': 1, 'tip_amount': 10.0, 'payment_type': 1}\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1353777be82f46c6ac53dddadf0eac47", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(17) 1: 0%| | 0/12 [00:00 6.0\n", ")\n", @@ -368,27 +457,30 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-12-13 21:10:58,370\tINFO plan.py:757 -- Using autodetected parallelism=173 for stage ReadParquet to satisfy output blocks of size at least DataContext.get_current().target_min_block_size=1.0MiB.\n", - "2023-12-13 21:10:58,372\tINFO plan.py:762 -- To satisfy the requested parallelism of 173, each read task output is split into 173 smaller blocks.\n", - "2023-12-13 21:10:58,389\tINFO plan.py:757 -- Using autodetected parallelism=173 for stage ReadParquet to satisfy output blocks of size at least DataContext.get_current().target_min_block_size=1.0MiB.\n", - "2023-12-13 21:10:58,390\tINFO plan.py:762 -- To satisfy the requested parallelism of 173, each read task output is split into 173 smaller blocks.\n", - " \r" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "408a4a5a85ea48a4a36a4b85342197f8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Read progress 0: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n", - "2023-12-13 21:11:00,841\tINFO streaming_executor.py:105 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-13 21:11:00,842\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\u001b[36m(ReadParquet->SplitBlocks(173) pid=42046)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(173) pid=42046)\u001b[0m return transform_pyarrow.concat(tables)\n" + "2024-04-23 15:46:06,493\tINFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-04-23_15-45-59_228244_75431/logs\n", + "2024-04-23 15:46:06,493\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n" ] }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "79e31b15590646f191fd10002e54d78a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(17) 1: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=2]\n", - "2023-12-13 21:11:01,261\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-13 21:11:01,262\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - " \r" + "2024-04-23 15:46:06,732\tINFO streaming_executor.py:112 -- Starting execution of Dataset. 
Full logs are in /tmp/ray/session_2024-04-23_15-45-59_228244_75431/logs\n", + "2024-04-23 15:46:06,732\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=2]\n" ] }, { "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7aebbbdf009849248dec2f702f64010a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(17) 1: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Write]\n", - "2023-12-13 21:11:01,756\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-13 21:11:01,758\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "Running: 1.0/8.0 CPU, 0.0/0.0 GPU, 173.03 MiB/512.0 MiB object_store_memory: 0%| | 0/173 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Write]\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-12-13 21:11:03,071\tWARNING plan.py:577 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. 
A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/latest/data/data-internals.html#ray-data-and-tune\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "80397722f6684cf9ae389f2666280041", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(17) 1: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[Write]\n", - "2023-12-13 21:29:25,053\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-13 21:29:25,054\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "\n", - "\u001b[A\n", - "Running: 0.0/8.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/3 [00:00 TaskPoolMapOperator[ReadParquet] -> AllToAllOperator[Repartition] -> TaskPoolMapOperator[Write]\n" ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Running: 1.0/8.0 CPU, 0.0/0.0 GPU, 173.91 MiB/512.0 MiB object_store_memory: 0%| | 0/3 [00:00SplitBlocks(173) pid=42044)\u001b[0m /Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:128: FutureWarning: promote has been superseded by mode='default'.\n", - "\n", - "Running: 1.0/8.0 CPU, 0.0/0.0 GPU, 173.91 MiB/512.0 MiB object_store_memory: 0%| | 0/3 [00:00SplitBlocks(173) pid=42044)\u001b[0m return transform_pyarrow.concat(tables) \n", - "\n", - "Running: 0.0/8.0 CPU, 0.0/0.0 GPU, 0.0 MiB/512.0 MiB object_store_memory: 0%| | 0/3 [00:01SplitBlocks(17) 1: 0%| | 0/12 [00:00\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.11.7
Ray version:2.11.0
Dashboard:http://127.0.0.1:8265
\n", + "\n", + "
\n", + "\n" + ], + "text/plain": [ + "RayContext(dashboard_url='127.0.0.1:8265', python_version='3.11.7', ray_version='2.11.0', ray_commit='2eb4a8119f903f79b78c01eaa9db06c6c390051c')" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "import os\n", - "import shutil\n", - "import warnings\n", - "import urllib.request\n", + "import sys\n", "from typing import Any, Dict\n", "\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_taxi\n", + "\n", "import numpy as np\n", "import pandas as pd\n", "import torch\n", "import ray\n", "\n", + "import warnings\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "if ray.is_initialized:\n", " ray.shutdown()\n", "\n", - "ray.init()\n", - "\n", - "folder_path = os.path.join(os.getcwd(), \"../data/nyc-taxi\")\n", - "download_url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet\"\n", - "file_name = download_url.split(\"/\")[-1]\n", - "parquet_file_path = os.path.join(folder_path, file_name)\n", - "if not os.path.exists(folder_path):\n", - " # 创建文件夹\n", - " os.makedirs(folder_path)\n", - " print(f\"文件夹 {folder_path} 不存在,已创建。\")\n", - " # 下载并保存 Parquet 文件\n", - " with urllib.request.urlopen(download_url) as response, open(parquet_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " print(\"数据已下载并保存为 Parquet 文件。\")\n", - "else:\n", - " print(f\"文件夹 {folder_path} 已存在,无需操作。\")" + "ray.init()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "读取数据到 `Dataset` 类,先查看原有的数据格式,其中 `tpep_pickup_datetime` 和 `tpep_dropoff_datetime` 分别为乘客上车和下车时间,包含了日期和时间。" + "读取数据到 `Dataset` 类,查看原有的数据格式。" ] }, { @@ -106,12 +139,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "128b9acfa02540eca47e5a1f91c66b4f", + "model_id": "f76cbb15ee5640edb2fbb39e0ce7ad0c", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Parquet Files Sample 0: 0%| | 0/1 [00:00 
TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n", - "2024-02-15 19:12:01,510\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:01,510\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2024-04-23 15:42:28,728\tINFO dataset.py:2370 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2024-04-23 15:42:28,731\tINFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:42:28,731\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c3e92b7a33ea4421b9e828dacf491111", + "model_id": "395f0b0d7c5e4dfc8ba32f921ae2af2c", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/1 [00:00SplitBlocks(5) 1: 0%| | 0/12 [00:00SplitBlocks(50) pid=5966)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5966)\u001b[0m return transform_pyarrow.concat(tables)\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fb37e85d9c9a49d48c284d6656c89ab4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- limit=1 2: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> 
TaskPoolMapOperator[Map(transform_row)] -> LimitOperator[limit=1]\n", - "2024-02-15 19:12:02,069\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:02,070\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2024-04-23 15:42:29,570\tINFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:42:29,571\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[Map(transform_row)] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "28fbc3a6677f4b1b809b0662a977f691", + "model_id": "d94215b543d749b3babaa614cc28ffa3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(5) 1: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=10]\n", - "2024-02-15 19:12:04,790\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:04,790\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2024-04-23 15:42:43,882\tINFO 
streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:42:43,882\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=1]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3af6e0ef7f5643a2b57cc3fcb0292e1c", + "model_id": "c5464c754fb243e4a607eb3368707918", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/1 [00:00SplitBlocks(5) 1: 0%| | 0/12 [00:00SplitBlocks(50) pid=5960)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5960)\u001b[0m return transform_pyarrow.concat(tables)\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9499c0dac4f14c55acde44de35bdb088", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- MapBatches(transform_df) 2: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches()]\n", - "2024-02-15 19:12:05,225\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:05,226\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2024-04-23 15:42:44,780\tINFO streaming_executor.py:112 -- Starting execution of Dataset. 
Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:42:44,780\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches()]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b77916ceb4b74390badf1461ce5fca01", + "model_id": "8390741d4e5548c9980167341e3d35a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(5) 1: 0%| | 0/12 [00:00) 2: 0%| | 0/12 [00:00 TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=100] -> ActorPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=3]\n", - "2024-02-15 19:12:06,831\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:06,832\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "2024-02-15 19:12:06,846\tINFO actor_pool_map_operator.py:114 -- MapBatches(TorchPredictor): Waiting for 2 pool actors to start...\n" + "2024-04-23 15:42:58,123\tWARNING util.py:560 -- The argument ``compute`` is deprecated in Ray 2.9. Please specify argument ``concurrency`` instead. For more information, see https://docs.ray.io/en/master/data/transforming-data.html#stateful-transforms.\n", + "2024-04-23 15:42:58,126\tINFO streaming_executor.py:112 -- Starting execution of Dataset. 
Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:42:58,126\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> TaskPoolMapOperator[MapBatches(transform_df)] -> LimitOperator[limit=100] -> ActorPoolMapOperator[MapBatches(TorchPredictor)] -> LimitOperator[limit=3]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "95bb2562908a46ec97ef831ae5ad6072", + "model_id": "37cb4284f0594a68968ff52547dc771a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- ReadParquet->SplitBlocks(5) 1: 0%| | 0/12 [00:00SplitBlocks(50) pid=5964)\u001b[0m /Users/luweizheng/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py:148: FutureWarning: promote has been superseded by mode='default'.\n", - "\u001b[36m(ReadParquet->SplitBlocks(50) pid=5964)\u001b[0m return transform_pyarrow.concat(tables)\n", - "2024-02-15 19:12:08,283\tWARNING actor_pool_map_operator.py:278 -- To ensure full parallelization across an actor pool of size 2, the Dataset should consist of at least 2 distinct blocks. Consider increasing the parallelism when creating the Dataset.\n" + "2024-04-23 15:43:00,636\tWARNING actor_pool_map_operator.py:292 -- To ensure full parallelization across an actor pool of size 2, the Dataset should consist of at least 2 distinct blocks. Consider increasing the parallelism when creating the Dataset.\n" ] }, { "data": { "text/plain": [ - "[{'output': 3.4000000953674316},\n", - " {'output': 3.4000000953674316},\n", - " {'output': 10.199999809265137}]" + "[{'output': 0.9700000286102295},\n", + " {'output': 1.100000023841858},\n", + " {'output': 2.509999990463257}]" ] }, "execution_count": 6, @@ -492,16 +691,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-02-15 19:12:08,306\tWARNING plan.py:588 -- Warning: The Ray cluster currently does not have any available CPUs. 
The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/latest/data/data-internals.html#ray-data-and-tune\n", - "2024-02-15 19:12:08,332\tINFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=20]\n", - "2024-02-15 19:12:08,333\tINFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2024-02-15 19:12:08,333\tINFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2024-04-23 15:43:00,727\tINFO streaming_executor.py:112 -- Starting execution of Dataset. 
Full logs are in /tmp/ray/session_2024-04-23_15-42-25_575692_75375/logs\n", + "2024-04-23 15:43:00,727\tINFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperator[limit=20]\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "54684a32e65142cda779ee0745f7b35e", + "model_id": "d97d55b85438459ea63d23561f58165d", "version_major": 2, "version_minor": 0 }, @@ -515,7 +712,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f2083525d44f443bacf0c63eb410f32b", + "model_id": "78efa492f1cc4dd3ad583d32ae78393e", "version_major": 2, "version_minor": 0 }, @@ -529,7 +726,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "618743a4df1c44b19c29e9aa9839f31d", + "model_id": "164a214aecbd4cbc918974f34097577a", "version_major": 2, "version_minor": 0 }, @@ -543,7 +740,21 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "36ab45bb3d474ab589df37f2b39f7e69", + "model_id": "bb86ebe071484972a761a2776034e671", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "- limit=20 4: 0%| | 0/4 [00:00 + + +
map()
map()
map_batches()
map_batche...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/ch-ray-data/preprocessor.ipynb b/ch-ray-data/preprocessor.ipynb index 9c793d0..3d6fc37 100644 --- a/ch-ray-data/preprocessor.ipynb +++ b/ch-ray-data/preprocessor.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { "tags": [ "hide-cell" @@ -35,20 +35,21 @@ }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/luweizheng/anaconda3/envs/dispy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "2023-12-15 14:07:48,341\tINFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", - "2023-12-15 14:07:50,654\tINFO worker.py:1673 -- Started a local Ray instance.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "文件夹 /Users/luweizheng/Projects/py-101/distributed-python/ch-ray-data/../data/nyc-taxi 已存在,无需操作。\n" + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 13\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mray\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ray\u001b[38;5;241m.\u001b[39mis_initialized:\n\u001b[0;32m---> 13\u001b[0m \u001b[43mray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshutdown\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m ray\u001b[38;5;241m.\u001b[39minit()\n", + "File 
\u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/_private/client_mode_hook.py:103\u001b[0m, in \u001b[0;36mclient_mode_hook..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minit\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m is_client_mode_enabled_by_default:\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(ray, func\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m)(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/_private/worker.py:1838\u001b[0m, in \u001b[0;36mshutdown\u001b[0;34m(_exiting_interpreter)\u001b[0m\n\u001b[1;32m 1836\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _global_node\u001b[38;5;241m.\u001b[39mis_head():\n\u001b[1;32m 1837\u001b[0m _global_node\u001b[38;5;241m.\u001b[39mdestroy_external_storage()\n\u001b[0;32m-> 1838\u001b[0m \u001b[43m_global_node\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkill_all_processes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcheck_alive\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mallow_graceful\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 1839\u001b[0m _global_node \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1840\u001b[0m storage\u001b[38;5;241m.\u001b[39m_reset()\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/_private/node.py:1604\u001b[0m, in \u001b[0;36mNode.kill_all_processes\u001b[0;34m(self, check_alive, allow_graceful, wait)\u001b[0m\n\u001b[1;32m 1598\u001b[0m \u001b[38;5;66;03m# Kill the raylet first. This is important for suppressing errors at\u001b[39;00m\n\u001b[1;32m 1599\u001b[0m \u001b[38;5;66;03m# shutdown because we give the raylet a chance to exit gracefully and\u001b[39;00m\n\u001b[1;32m 1600\u001b[0m \u001b[38;5;66;03m# clean up its child worker processes. If we were to kill the plasma\u001b[39;00m\n\u001b[1;32m 1601\u001b[0m \u001b[38;5;66;03m# store (or Redis) first, that could cause the raylet to exit\u001b[39;00m\n\u001b[1;32m 1602\u001b[0m \u001b[38;5;66;03m# ungracefully, leading to more verbose output from the workers.\u001b[39;00m\n\u001b[1;32m 1603\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ray_constants\u001b[38;5;241m.\u001b[39mPROCESS_TYPE_RAYLET \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mall_processes:\n\u001b[0;32m-> 1604\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_kill_process_type\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1605\u001b[0m \u001b[43m \u001b[49m\u001b[43mray_constants\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPROCESS_TYPE_RAYLET\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1606\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_alive\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_alive\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1607\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_graceful\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_graceful\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1608\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1609\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1611\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ray_constants\u001b[38;5;241m.\u001b[39mPROCESS_TYPE_GCS_SERVER \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mall_processes:\n\u001b[1;32m 1612\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_kill_process_type(\n\u001b[1;32m 1613\u001b[0m ray_constants\u001b[38;5;241m.\u001b[39mPROCESS_TYPE_GCS_SERVER,\n\u001b[1;32m 1614\u001b[0m check_alive\u001b[38;5;241m=\u001b[39mcheck_alive,\n\u001b[1;32m 1615\u001b[0m allow_graceful\u001b[38;5;241m=\u001b[39mallow_graceful,\n\u001b[1;32m 1616\u001b[0m wait\u001b[38;5;241m=\u001b[39mwait,\n\u001b[1;32m 1617\u001b[0m )\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/_private/node.py:1425\u001b[0m, in \u001b[0;36mNode._kill_process_type\u001b[0;34m(self, process_type, allow_graceful, check_alive, wait)\u001b[0m\n\u001b[1;32m 1423\u001b[0m \u001b[38;5;66;03m# Ensure thread safety\u001b[39;00m\n\u001b[1;32m 1424\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mremoval_lock:\n\u001b[0;32m-> 1425\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_kill_process_impl\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1426\u001b[0m \u001b[43m \u001b[49m\u001b[43mprocess_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1427\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_graceful\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_graceful\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1428\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_alive\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_alive\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1429\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1430\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/site-packages/ray/_private/node.py:1481\u001b[0m, in \u001b[0;36mNode._kill_process_impl\u001b[0;34m(self, process_type, allow_graceful, check_alive, wait)\u001b[0m\n\u001b[1;32m 1479\u001b[0m timeout_seconds \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 1480\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1481\u001b[0m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout_seconds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1482\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m subprocess\u001b[38;5;241m.\u001b[39mTimeoutExpired:\n\u001b[1;32m 1483\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1262\u001b[0m endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# exit under the common assumption that it also 
received the ^C\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/miniconda3/envs/dispy/lib/python3.11/subprocess.py:2040\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 2038\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m TimeoutExpired(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, timeout)\n\u001b[1;32m 2039\u001b[0m delay \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmin\u001b[39m(delay \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m2\u001b[39m, remaining, \u001b[38;5;241m.05\u001b[39m)\n\u001b[0;32m-> 2040\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(delay)\n\u001b[1;32m 2041\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2042\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -58,68 +59,40 @@ "import urllib.request\n", "from typing import Any, Dict\n", "\n", + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_taxi\n", + "\n", "import ray\n", "\n", "if ray.is_initialized:\n", " ray.shutdown()\n", "\n", - "ray.init()\n", - "\n", - "folder_path = os.path.join(os.getcwd(), \"../data/nyc-taxi\")\n", - "download_url = \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet\"\n", - "file_name = download_url.split(\"/\")[-1]\n", - "parquet_file_path = os.path.join(folder_path, file_name)\n", - "if not os.path.exists(folder_path):\n", - " # 创建文件夹\n", - " os.makedirs(folder_path)\n", - " print(f\"文件夹 {folder_path} 不存在,已创建。\")\n", - " # 下载并保存 Parquet 文件\n", - " with urllib.request.urlopen(download_url) as 
response, open(parquet_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " print(\"数据已下载并保存为 Parquet 文件。\")\n", - "else:\n", - " print(f\"文件夹 {folder_path} 已存在,无需操作。\")" + "ray.init()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "(pid=6866) Parquet Files Sample 0: 0%| | 0/1 [00:00 TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]\n", - "2023-12-15 14:17:24,492\tINFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-12-15 14:17:24,493\tINFO streaming_executor.py:107 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - " \r" + "ename": "NameError", + "evalue": "name 'nyc_taxi' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessors\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MinMaxScaler\n\u001b[0;32m----> 3\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m \u001b[43mnyc_taxi\u001b[49m()\n\u001b[1;32m 4\u001b[0m ds \u001b[38;5;241m=\u001b[39m ray\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mread_parquet(dataset_path,\n\u001b[1;32m 5\u001b[0m 
columns\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrip_distance\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 6\u001b[0m ds\u001b[38;5;241m.\u001b[39mtake(\u001b[38;5;241m1\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'nyc_taxi' is not defined" ] - }, - { - "data": { - "text/plain": [ - "[{'trip_distance': 3.4}]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "from ray.data.preprocessors import MinMaxScaler\n", "\n", - "ds = ray.data.read_parquet(parquet_file_path,\n", + "dataset_path = nyc_taxi()\n", + "ds = ray.data.read_parquet(dataset_path,\n", " columns=[\"trip_distance\"])\n", "ds.take(1)" ] @@ -133,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -189,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -310,7 +283,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/ch-ray-train-tune/tune-algorithm-scheduler.ipynb b/ch-ray-train-tune/tune-algorithm-scheduler.ipynb index 1f17d97..205b29c 100644 --- a/ch-ray-train-tune/tune-algorithm-scheduler.ipynb +++ b/ch-ray-train-tune/tune-algorithm-scheduler.ipynb @@ -116,11 +116,11 @@ "outputs": [], "source": [ "import os\n", - "import urllib\n", - "import shutil\n", "import tempfile\n", "\n", - "from zipfile import ZipFile\n", + "import sys\n", + "sys.path.append(\"..\")\n", + "from datasets import nyc_flights\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", @@ -141,17 +141,8 @@ "from ray.tune.integration.xgboost import TuneReportCheckpointCallback\n", "from ray.tune.schedulers import PopulationBasedTraining\n", "\n", - "folder_path = os.path.join(os.getcwd(), \"../data/\")\n", - "download_url = \"https://dp.godaai.org/nyc-flights.zip\"\n", - 
"zip_file_path = os.path.join(folder_path, \"nyc-flights.zip\")\n", - "if not os.path.exists(os.path.join(folder_path, \"nyc-flights\")):\n", - " with urllib.request.urlopen(download_url) as response, open(zip_file_path, 'wb') as out_file:\n", - " shutil.copyfileobj(response, out_file)\n", - " zf = ZipFile(zip_file_path, 'r')\n", - " zf.extractall(folder_path)\n", - " zf.close()\n", - "\n", - "file_path = os.path.join(folder_path, \"nyc-flights\", \"1991.csv\")" + "folder_path = nyc_flights()\n", + "file_path = os.path.join(folder_path, \"1991.csv\")" ] }, { diff --git a/conf.py b/conf.py index 47937e7..cb16516 100644 --- a/conf.py +++ b/conf.py @@ -49,7 +49,7 @@ } html_static_path = ["_static"] html_css_files = ["custom.css"] -html_title = 'Python 分布式编程' +html_title = 'Python 数据科学加速' latex_engine = 'pdflatex' myst_enable_extensions = ['colon_fence', 'dollarmath', 'linkify', 'substitution', 'tasklist'] myst_url_schemes = ['mailto', 'http', 'https'] diff --git a/datasets.py b/datasets.py new file mode 100644 index 0000000..8331ca0 --- /dev/null +++ b/datasets.py @@ -0,0 +1,42 @@ +import os +import shutil +import urllib + +def nyc_taxi(data_path: str = "../data/nyc-taxi"): + + folder_path = os.path.join(os.getcwd(), "../data/nyc-taxi") + download_url = [ + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet", + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet", + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet", + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet", + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-05.parquet", + "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-06.parquet", + ] + if not os.path.exists(folder_path): + os.makedirs(folder_path) + for url in download_url: + file_name = url.split("/")[-1] + parquet_file_path = os.path.join(folder_path, file_name) + if not 
os.path.exists(os.path.join(folder_path, file_name)): + print(f"正在下载 {file_name}") + with urllib.request.urlopen(url) as response, open(parquet_file_path, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + + return folder_path + + +def nyc_flights(data_path: str = "../data/"): + folder_path = os.path.join(os.getcwd(), data_path) + download_url = "https://dp.godaai.org/nyc-flights.zip" + zip_file_path = os.path.join(folder_path, "nyc-flights.zip") + if not os.path.exists(os.path.join(folder_path, "nyc-flights")): + print(f"正在下载 nyc-flights.zip") + with urllib.request.urlopen(download_url) as response, open(zip_file_path, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + zf = ZipFile(zip_file_path, 'r') + zf.extractall(folder_path) + zf.close() + + folder_path = os.path.join(folder_path, "nyc-flights") + return folder_path \ No newline at end of file diff --git a/drawio/ch-dask-dataframe/map-partitions.drawio b/drawio/ch-dask-dataframe/map-partitions.drawio new file mode 100644 index 0000000..7dc874d --- /dev/null +++ b/drawio/ch-dask-dataframe/map-partitions.drawio @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/data-paralism.drawio b/drawio/ch-mpi-large-model/data-paralism.drawio deleted file mode 100644 index b006cdd..0000000 --- a/drawio/ch-mpi-large-model/data-paralism.drawio +++ /dev/null @@ -1,628 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/drawio/ch-mpi-large-model/data-parallel.drawio b/drawio/ch-mpi-large-model/data-parallel.drawio index 8d88f2b..e0cb786 100644 --- a/drawio/ch-mpi-large-model/data-parallel.drawio +++ b/drawio/ch-mpi-large-model/data-parallel.drawio @@ -1,99 +1,99 @@ - + - + - + - + - + - - - - + - + - + - + - - - - + - + - + - + - - - - + - + - + - - - - - + + - + - + - + - + - + - + - + + + + + + + + + + + + + diff --git a/drawio/ch-mpi-large-model/pipeline-parallel.drawio b/drawio/ch-mpi-large-model/pipeline-parallel.drawio index 35bf846..79860f4 100644 --- a/drawio/ch-mpi-large-model/pipeline-parallel.drawio +++ b/drawio/ch-mpi-large-model/pipeline-parallel.drawio @@ -1,97 +1,97 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - + - + - + - + diff --git a/drawio/ch-mpi-large-model/pipline-parallel.drawio b/drawio/ch-mpi-large-model/pipline-parallel.drawio deleted file mode 100644 index 59ca275..0000000 --- a/drawio/ch-mpi-large-model/pipline-parallel.drawio +++ /dev/null @@ -1,500 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/drawio/ch-ray-data/map-map-batches.drawio b/drawio/ch-ray-data/map-map-batches.drawio index 651165b..198ae47 100644 --- a/drawio/ch-ray-data/map-map-batches.drawio +++ b/drawio/ch-ray-data/map-map-batches.drawio @@ -1,10 +1,10 @@ - + - + - + @@ -32,13 +32,13 @@ - + - + @@ -55,14 +55,14 @@ - + - + @@ -77,19 +77,19 @@ - + - + - + - + @@ -98,7 +98,7 @@ - + diff --git a/img/ch-dask-dataframe/map-partitions.svg b/img/ch-dask-dataframe/map-partitions.svg new file mode 100644 index 0000000..8200247 --- /dev/null +++ b/img/ch-dask-dataframe/map-partitions.svg @@ -0,0 +1,4 @@ + + + +
map_partitions()
map_partit...
pandas
DataFrame
pandas...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/img/ch-dask-dataframe/nyc-flights-graph.svg b/img/ch-dask-dataframe/nyc-flights-graph.svg index b9d0a86..b21a31d 100644 --- a/img/ch-dask-dataframe/nyc-flights-graph.svg +++ b/img/ch-dask-dataframe/nyc-flights-graph.svg @@ -1,262 +1,262 @@ - - + --7150382809904974857 +3521720512806484412 read-csv - + --6194689680917011400 +6736009212905862099 0 - + --7150382809904974857->-6194689680917011400 +3521720512806484412->6736009212905862099 - + -6071941830101125281 +-1563468653830855738 to_pyarrow_string - + --6194689680917011400->6071941830101125281 +6736009212905862099->-1563468653830855738 - + --65660599333282282 +2484881100534163769 read-csv - + -312951075184987025 +-2555422199051202395 1 - + --65660599333282282->312951075184987025 +2484881100534163769->-2555422199051202395 - + --8135483071169533727 +2675850518608036870 to_pyarrow_string - + -312951075184987025->-8135483071169533727 +-2555422199051202395->2675850518608036870 - + --1102500011605602925 +1680754195385590727 read-csv - + --8978480336772077469 +3952218557050796030 2 - + --1102500011605602925->-8978480336772077469 +1680754195385590727->3952218557050796030 - + --1050760860597841152 +4484414144468516194 to_pyarrow_string - + --8978480336772077469->-1050760860597841152 +3952218557050796030->4484414144468516194 - + --1906626916754175967 +1719447117322426477 read-csv - + --2470839580670079044 +-7986884760556757161 3 - + --1906626916754175967->-2470839580670079044 +1719447117322426477->-7986884760556757161 - + -4071937302134756076 +8723733316907408802 to_pyarrow_string - + --2470839580670079044->4071937302134756076 +-7986884760556757161->8723733316907408802 - + -5178095293817516608 +5958766289761319085 read-csv - + -4036801175431919381 +5566785284180098089 4 - + -5178095293817516608->4036801175431919381 +5958766289761319085->5566785284180098089 - + -8508987607055959954 +7919606411758835760 to_pyarrow_string - + -4036801175431919381->8508987607055959954 
+5566785284180098089->7919606411758835760 - + --9029329607453142400 +-121519200098467208 read-csv - + --856272853540776985 +7676068657297728386 5 - + --9029329607453142400->-856272853540776985 +-121519200098467208->7676068657297728386 - + --2363636268343853305 +2288159746428799421 to_pyarrow_string - + --856272853540776985->-2363636268343853305 +7676068657297728386->2288159746428799421 - + -8774990811659256194 +-5288713516117402287 0 - + -6071941830101125281->8774990811659256194 +-1563468653830855738->-5288713516117402287 - + -3881916782686559828 +8264956528619452963 1 - + --8135483071169533727->3881916782686559828 +2675850518608036870->8264956528619452963 - + --8057186534920993363 +-8072504171972468356 2 - + --1050760860597841152->-8057186534920993363 +4484414144468516194->-8072504171972468356 - + -1098126126831493759 +5481165872764386894 3 - + -4071937302134756076->1098126126831493759 +8723733316907408802->5481165872764386894 - + -7605766882933492184 +-6457937444843166297 4 - + -8508987607055959954->7605766882933492184 +7919606411758835760->-6457937444843166297 - + --4333336434674061007 +49703311258832128 5 - + --2363636268343853305->-4333336434674061007 +2288159746428799421->49703311258832128 diff --git a/img/ch-mpi-large-model/data-parallel.svg b/img/ch-mpi-large-model/data-parallel.svg index 8a096fa..a3815ae 100644 --- a/img/ch-mpi-large-model/data-parallel.svg +++ b/img/ch-mpi-large-model/data-parallel.svg @@ -1,4 +1,4 @@ -
GPU 0
GPU 0
模型拷贝
模型拷贝
GPU 1
GPU 1
模型拷贝
模型拷贝
GPU 2
GPU 2
模型拷贝
模型拷贝
GPU 2
GPU 2
模型拷贝
模型拷贝
训练数据
训练数据
Text is not SVG - cannot display
\ No newline at end of file +
GPU 0
GPU 0
模型拷贝
模型拷贝
GPU 1
GPU 1
模型拷贝
模型拷贝
GPU 2
GPU 2
模型拷贝
模型拷贝
GPU 3
GPU 3
模型拷贝
模型拷贝
训练数据
训练数据
Text is not SVG - cannot display
\ No newline at end of file diff --git a/img/ch-mpi-large-model/pipeline-parallel.svg b/img/ch-mpi-large-model/pipeline-parallel.svg index 03d030d..fed840e 100644 --- a/img/ch-mpi-large-model/pipeline-parallel.svg +++ b/img/ch-mpi-large-model/pipeline-parallel.svg @@ -1,4 +1,4 @@ -
GPU 0
GPU 0
模型第 0 层
模型第 0 层
GPU 1
GPU 1
模型第 1 层
模型第 1 层
GPU 2
GPU 2
模型第 2 层
模型第 2 层
GPU 3
GPU 3
模型第 3 层
模型第 3 层
前向传播
前向传播
前向传播
前向传播
Text is not SVG - cannot display
\ No newline at end of file +
GPU 0
GPU 0
模型第 0 层
模型第 0 层
GPU 1
GPU 1
模型第 1 层
模型第 1 层
GPU 2
GPU 2
模型第 2 层
模型第 2 层
GPU 3
GPU 3
模型第 3 层
模型第 3 层
前向传播
前向传播
反向传播
反向传播
Text is not SVG - cannot display
\ No newline at end of file diff --git a/img/ch-ray-data/map-map-batches.svg b/img/ch-ray-data/map-map-batches.svg index 30c8538..90be5d0 100644 --- a/img/ch-ray-data/map-map-batches.svg +++ b/img/ch-ray-data/map-map-batches.svg @@ -1,4 +1,4 @@ -
map()
map()
map_batches()
map_batche...
Text is not SVG - cannot display
\ No newline at end of file +
map()
map()
map_batches()
map_batche...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/img/mpi-logo.png b/img/mpi-logo.png new file mode 100644 index 0000000..6d41a88 Binary files /dev/null and b/img/mpi-logo.png differ diff --git a/index.md b/index.md index 8e591bf..816cc3c 100644 --- a/index.md +++ b/index.md @@ -1,19 +1,21 @@ -# Python 分布式编程 +# Python 数据科学加速 ::::{grid} 2 :reverse: :::{grid-item} -:columns: 4 +:columns: 3 :class: sd-m-auto ::: :::{grid-item} -:columns: 8 +:columns: 9 :class: sd-fs-3 -开源的、面向下一代人工智能应用的 Python 分布式编程书籍。 +Dask、Ray、Xorbits、mpi4py + +开源的、面向下一代数据科学和人工智能应用的 Python 数据科学并行加速书籍。 % The SVG rendering breaks latex builds for the GitHub badge, so only include in HTML ```{only} html @@ -24,7 +26,7 @@ :::: -::::{card-carousel} 3 +::::{card-carousel} 2 :::{card} :margin: 3 @@ -48,7 +50,9 @@ :height: 100 ``` ::: +:::: +::::{card-carousel} 2 :::{card} :margin: 3 @@ -61,6 +65,19 @@ :height: 100 ``` ::: + +:::{card} +:margin: 3 +:class-body: text-center +:class-header: bg-light text-center + +**mpi4py** +^^^ +```{image} ./img/mpi-logo.png +:height: 100 +``` +::: + :::: ## 主要作者