From 68d9dcab5b543adb3bfe5b83563c61a9b8afae77 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 22 Oct 2024 01:43:47 +0530 Subject: [PATCH 01/32] DOC: fix SA01 for pandas.core.resample.Resampler.ohlc (#60036) --- ci/code_checks.sh | 3 --- pandas/core/groupby/groupby.py | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1974c98a1d1ff..427938c3c5fba 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -104,7 +104,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ @@ -114,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ @@ -124,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.ohlc SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ -i "pandas.core.resample.Resampler.sem SA01" \ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8dfef9e70db52..f12ded6045e80 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3224,6 +3224,12 @@ def ohlc(self) -> DataFrame: DataFrame Open, high, low and close values within each group. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + DataFrame.resample : Resample time-series data. + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- From 6d4ba801893a76cdd22da3e373ce7986ac98cda1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 26 Oct 2024 14:30:21 -0400 Subject: [PATCH 02/32] CI/TST: Update pyreadstat tests and pin xarray on CI (#60109) --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/circle-311-arm64.yaml | 2 +- environment.yml | 2 +- pandas/tests/io/test_spss.py | 4 +++- requirements-dev.txt | 2 +- 8 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index c33c0344e742f..b1c7fda910f67 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8692b6e35ab2d..f7fc4c38add90 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -53,7 +53,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 8e7d9aba7878d..f1ab3c37c4c71 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml 
@@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 6c97960a62d40..d39d572eda619 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index c86534871b3d2..def7faeb8bcaa 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <2024.10.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/environment.yml b/environment.yml index ab834735441f0..c05f8dbebd28e 100644 --- a/environment.yml +++ b/environment.yml @@ -55,7 +55,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 1aa9f6dca0303..950f74a686b8d 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -177,4 +177,6 @@ def test_spss_metadata(datapath): "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), } ) - assert df.attrs == metadata + if Version(pyreadstat.__version__) >= Version("1.2.8"): + metadata["mr_sets"] = {} + tm.assert_dict_equal(df.attrs, metadata) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1bf42af6bf2cd..00e320e6370ce 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -44,7 +44,7 @@ s3fs>=2022.11.0 scipy>=1.10.0 SQLAlchemy>=2.0.0 tabulate>=0.9.0 -xarray>=2022.12.0 
+xarray>=2022.12.0, <=2024.9.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 From 8d2ca0bf84bcf44a800ac19bdb4ed7ec88c555e2 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 27 Oct 2024 18:43:50 +0530 Subject: [PATCH 03/32] DOC: fix RT03,SA01,ES01 for pandas.core.resample.Resampler.__iter__ (#60033) --- ci/code_checks.sh | 3 --- pandas/core/groupby/groupby.py | 18 ++++++++++++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 427938c3c5fba..6f65c52d6f5a3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ @@ -106,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ @@ -115,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i "pandas.core.resample.Resampler.groups SA01" \ -i "pandas.core.resample.Resampler.indices SA01" \ diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index f12ded6045e80..a0bd25525c55f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -767,10 +767,24 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator. + This method provides an iterator over the groups created by the ``resample`` + or ``groupby`` operation on the object. The method yields tuples where + the first element is the label (group key) corresponding to each group or + resampled bin, and the second element is the subset of the data that falls + within that group or bin. + Returns ------- - Generator yielding sequence of (name, subsetted object) - for each group + Iterator + Generator yielding a sequence of (name, subsetted object) + for each group. + + See Also + -------- + Series.groupby : Group data by a specific key or column. + DataFrame.groupby : Group DataFrame using mapper or by columns. + DataFrame.resample : Resample a DataFrame. + Series.resample : Resample a Series. Examples -------- From c5bf9373b13dbabcd78a462259f455a42e546afa Mon Sep 17 00:00:00 2001 From: Niklas Rousset <75939868+niklasr22@users.noreply.github.com> Date: Mon, 28 Oct 2024 18:05:33 +0100 Subject: [PATCH 04/32] CLN: fix pre-commit stage names (#60076) --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1cb7b288aba69..87212309725c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,9 +2,9 @@ minimum_pre_commit_version: 2.15.0 exclude: ^LICENSES/|\.(html|csv|svg)$ # reserve "manual" for relatively slow hooks which we still want to run in CI default_stages: [ - commit, - merge-commit, - push, + pre-commit, + pre-merge-commit, + pre-push, prepare-commit-msg, commit-msg, post-checkout, From e3e198f3cf0b7baac52ee9c5b45b54d997786d7c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 28 Oct 2024 22:36:48 +0530 Subject: [PATCH 05/32] DOC: fix SA01 for 
pandas.errors.PossiblePrecisionLoss (#60061) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6f65c52d6f5a3..30f0ec226d64f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -140,7 +140,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ - -i "pandas.errors.PossiblePrecisionLoss SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index efc032b0b559e..b6df34e33ecce 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -691,6 +691,10 @@ class PossiblePrecisionLoss(Warning): When the column value is outside or equal to the int64 value the column is converted to a float64 dtype. + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. 
+ Examples -------- >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) From e3a3a4a5fbc6451006822e08d1d54d991f6d2c3f Mon Sep 17 00:00:00 2001 From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com> Date: Tue, 29 Oct 2024 05:05:18 +0800 Subject: [PATCH 06/32] ENH: Improve error mesage verbosity in string accessor (#59900) --- pandas/core/strings/accessor.py | 4 +++- pandas/tests/series/accessors/test_str_accessor.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3cb0e75cfb815..05e1a36877e06 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -255,7 +255,9 @@ def _validate(data): inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") + raise AttributeError( + f"Can only use .str accessor with string values, not {inferred_dtype}" + ) return inferred_dtype def __getitem__(self, key): diff --git a/pandas/tests/series/accessors/test_str_accessor.py b/pandas/tests/series/accessors/test_str_accessor.py index 09d965ef1f322..ff530459b78fb 100644 --- a/pandas/tests/series/accessors/test_str_accessor.py +++ b/pandas/tests/series/accessors/test_str_accessor.py @@ -15,7 +15,8 @@ def test_str_attribute(self): # str accessor only valid with string values ser = Series(range(5)) - with pytest.raises(AttributeError, match="only use .str accessor"): + msg = "Can only use .str accessor with string values, not integer" + with pytest.raises(AttributeError, match=msg): ser.str.repeat(2) def test_str_accessor_updates_on_inplace(self): From 85c93d0521d51eb4f7083e7f5ab580ab1041b857 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:39:44 +0530 Subject: [PATCH 07/32] DOC: fix SA01 for pandas.errors.OptionError (#60031) --- ci/code_checks.sh | 1 - pandas/_config/config.py | 4 ++++ 2 files changed, 4 
insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 30f0ec226d64f..a5b4eb47df712 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -136,7 +136,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OptionError SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 25760df6bd7a4..1d57aa806e0f1 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -105,6 +105,10 @@ class OptionError(AttributeError, KeyError): Backwards compatible with KeyError checks. + See Also + -------- + options : Access and modify global pandas settings. + Examples -------- >>> pd.options.context From 0bbca468ab2f35797494c1a6ec9cfd89e10464db Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:40:21 +0530 Subject: [PATCH 08/32] DOC: fix SA01 for pandas.errors.CSSWarning (#60030) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a5b4eb47df712..450678bf55fa2 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -127,7 +127,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ - -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.ChainedAssignmentError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index b6df34e33ecce..2fafb15822201 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ 
-591,6 +591,14 @@ class CSSWarning(UserWarning): This can be due to the styling not having an equivalent value or because the styling isn't properly formatted. + See Also + -------- + DataFrame.style : Returns a Styler object for applying CSS-like styles. + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + io.formats.style.Styler.to_excel : Export styled DataFrame to Excel. + io.formats.style.Styler.to_html : Export styled DataFrame to HTML. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1]}) From a518b8fd5117fae9872c83ccf79235976c0359dc Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:41:18 +0530 Subject: [PATCH 09/32] DOC: fix PR01,SA01,ES01 for pandas.api.types.is_integer (#60034) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 450678bf55fa2..3ef47b7992463 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -85,7 +85,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 23e0f387466aa..8b6d73cda355b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1122,9 +1122,23 @@ def is_integer(obj: object) -> bool: """ Return True if given object is integer. + This method checks whether the passed object is an integer type. It + returns `True` if the object is an integer, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for integer type. + Returns ------- bool + `True` if the object is of integer type, otherwise `False`. 
+ + See Also + -------- + api.types.is_float : Check if an object is of float type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- From b9c6fa81ed9ed1247412a5e23e4e88feb3bb0427 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:42:03 +0530 Subject: [PATCH 10/32] DOC: fix SA01,ES01 for pandas.core.resample.Resampler.sum (#60037) --- ci/code_checks.sh | 1 - pandas/core/resample.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3ef47b7992463..4578a831a20f0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ -i "pandas.core.resample.Resampler.sem SA01" \ -i "pandas.core.resample.Resampler.std SA01" \ - -i "pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 42fed83398737..ca4d3fc768efb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1021,6 +1021,10 @@ def sum( """ Compute sum of group values. + This method provides a simple way to compute the sum of values within each + resampled group, particularly useful for aggregating time-based data into + daily, monthly, or yearly sums. + Parameters ---------- numeric_only : bool, default False @@ -1039,6 +1043,14 @@ def sum( Series or DataFrame Computed sum of values within each group. + See Also + -------- + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.count : Compute count of group, excluding missing + values. + DataFrame.resample : Resample time-series data. + Series.sum : Return the sum of the values over the requested axis. 
+ Examples -------- >>> ser = pd.Series( From 40b5610db837f21cfae906e4e3c93b2877b2b034 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:46:10 +0530 Subject: [PATCH 11/32] DOC: fix SA01,ES01 for pandas.tseries.offsets.SemiMonthBegin (#60060) * DOC: fix SA01 for pandas.tseries.offsets.SemiMonthBegin * DOC: fix ES01 for pandas.tseries.offsets.SemiMonthBegin --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4578a831a20f0..cbecb855d3dfc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -292,7 +292,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ -i "pandas.tseries.offsets.Second.n GL08" \ -i "pandas.tseries.offsets.Second.normalize GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin SA01" \ -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4db96fbaa3aad..7569f8e8864a0 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3371,6 +3371,10 @@ cdef class SemiMonthBegin(SemiMonthOffset): """ Two DateOffset's per month repeating on the first day of the month & day_of_month. + This offset moves dates to the first day of the month and an additional specified + day (typically the 15th by default), useful in scenarios where bi-monthly processing + occurs on set days. + Attributes ---------- n : int, default 1 @@ -3380,6 +3384,13 @@ cdef class SemiMonthBegin(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthEnd : Two DateOffset's per month repeating on the last day + of the month & day_of_month. 
+ tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) From 824750a6df1664ebd6ccb7d25fd2e74188845d02 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:47:41 +0530 Subject: [PATCH 12/32] DOC: fix SA01 for pandas.errors.UnsupportedFunctionCall (#60072) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cbecb855d3dfc..4f831ae8261fc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -138,7 +138,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ - -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2fafb15822201..0aaee1ec177ee 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -76,6 +76,12 @@ class UnsupportedFunctionCall(ValueError): For example, ``np.cumsum(groupby_object)``. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + core.groupby.GroupBy.cumsum : Compute cumulative sum for each group. 
+ Examples -------- >>> df = pd.DataFrame( From fc3aff432f6bd28d8f9ccacac94b202ddc6040b1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 30 Oct 2024 01:49:29 +0530 Subject: [PATCH 13/32] DOC: fix RT03,SA01,ES01 for pandas.json_normalize (#60032) --- ci/code_checks.sh | 1 - pandas/io/json/_normalize.py | 13 +++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4f831ae8261fc..b727d93879a86 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -142,7 +142,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ - -i "pandas.json_normalize RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 7d3eefae39679..45c8876dbe3e5 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -279,6 +279,10 @@ def json_normalize( """ Normalize semi-structured JSON data into a flat table. + This method is designed to transform semi-structured JSON data, such as nested + dictionaries or lists, into a flat table. This is particularly useful when + handling JSON-like data structures that contain deeply nested fields. + Parameters ---------- data : dict, list of dicts, or Series of dicts @@ -310,8 +314,13 @@ def json_normalize( Returns ------- - frame : DataFrame - Normalize semi-structured JSON data into a flat table. + DataFrame + The normalized data, represented as a pandas DataFrame. + + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Series : One-dimensional ndarray with axis labels (including time series). 
Examples -------- From f770beee9ea737f2feecdd9ae4db2a12f8b1cce2 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Tue, 29 Oct 2024 14:46:47 -0600 Subject: [PATCH 14/32] Add information about code coverage to docs (#60029) * add information about code coverage to docs * linting --- doc/source/development/contributing_codebase.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 670ffe6996302..c1cfb0d7a623b 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -298,6 +298,12 @@ So, before actually writing any code, you should write your tests. Often the te taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. +We use `code coverage `_ to help understand +the amount of code which is covered by a test. We recommend striving to ensure code +you add or change within Pandas is covered by a test. Please see our +`code coverage dashboard through Codecov `_ +for more information. + Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. From 9d184aaed5582263e5ad01442a0f367291af3151 Mon Sep 17 00:00:00 2001 From: eightyseven Date: Wed, 30 Oct 2024 04:50:34 +0800 Subject: [PATCH 15/32] DOC: fix docstring for DataFrame.round() (#60040) update --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24a164aa15427..6b646c5591fab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10823,7 +10823,7 @@ def round( self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs ) -> DataFrame: """ - Round a DataFrame to a variable number of decimal places. 
+ Round numeric columns in a DataFrame to a variable number of decimal places. Parameters ---------- From 1ca15d362ee395ab69ea3e47139bbb5377155562 Mon Sep 17 00:00:00 2001 From: Myles Scolnick Date: Tue, 29 Oct 2024 13:54:40 -0700 Subject: [PATCH 16/32] docs: add marimo to ecosystem.md (#60051) --- web/pandas/community/ecosystem.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 2ea10954fc929..6c69ff7602491 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -239,6 +239,17 @@ Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spy render Numpydoc documentation on pandas objects in rich text with Sphinx both automatically and on-demand. +### [marimo](https://marimo.io) + +marimo is a reactive notebook for Python and SQL that enhances productivity when working with dataframes. It provides several features to make data manipulation and visualization more interactive and fun: + +1. Rich, interactive displays: marimo can display pandas dataframes in interactive tables or charts with filtering and sorting capabilities. +2. Data selection: Users can select data in tables or pandas-backed plots, and the selections are automatically sent to Python as pandas dataframes. +3. No-code transformations: Users can interactively transform pandas dataframes using a GUI, without writing code. The generated code can be copied and pasted into the notebook. +4. Custom filters: marimo allows the creation of pandas-backed filters using UI elements like sliders and dropdowns. +5. Dataset explorer: marimo automatically discovers and displays all dataframes in the notebook, allowing users to explore and visualize data interactively. +6. SQL integration: marimo allows users to write SQL queries against any pandas dataframes existing in memory. 
+ ## API ### [pandas-datareader](https://github.com/pydata/pandas-datareader) From b5c7f25b109a749b720df7c102d38e994d5f868e Mon Sep 17 00:00:00 2001 From: ZKaoChi <1953542921@qq.com> Date: Wed, 30 Oct 2024 04:57:26 +0800 Subject: [PATCH 17/32] DOC: Add Timedelta accepting float value (#60058) * DOC: Solution for issue #60044 * reduce characters for each line * DOC: Make the value parameter of pandas.Timedelta can accept float * Change the expression * environment update * environment update --- pandas/_libs/tslibs/timedeltas.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index bbefea7c47fc3..299730df86923 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1864,10 +1864,12 @@ class Timedelta(_Timedelta): Parameters ---------- - value : Timedelta, timedelta, np.timedelta64, str, or int + value : Timedelta, timedelta, np.timedelta64, str, int or float Input value. unit : str, default 'ns' - Denote the unit of the input, if input is an integer. + If input is an integer, denote the unit of the input. + If input is a float, denote the unit of the integer parts. + The decimal parts with resolution lower than 1 nanosecond are ignored. Possible values: From 9e10119dc8c3ad34cee53e113afedd90cf70a0ec Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Tue, 29 Oct 2024 16:01:05 -0500 Subject: [PATCH 18/32] BLD: relax meson/meson-python requirements (#60089) * relax meson/meson-python requirements This makes bugfixes from later meson/meson-python fixes available to build pandas. For eg: python 3.13t support in meson, needs an up-to-date version of meson. 
* Drop upper bound for meson-python --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6a963e94f5b8..6dfee8f4910db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python==0.13.1", - "meson==1.2.1", + "meson-python>=0.13.1", + "meson>=1.2.1,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0rc1, so that built wheels are compatible From d8905e4bee2aa0e096ed7831fea7d395d7657120 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Oct 2024 09:00:22 +0100 Subject: [PATCH 19/32] TST (string dtype): duplicate pandas/tests/indexes/object tests specifically for string dtypes (#60117) --- pandas/tests/indexes/object/test_astype.py | 18 --- pandas/tests/indexes/object/test_indexing.py | 82 ++----------- pandas/tests/indexes/string/__init__.py | 0 pandas/tests/indexes/string/test_astype.py | 21 ++++ pandas/tests/indexes/string/test_indexing.py | 118 +++++++++++++++++++ 5 files changed, 148 insertions(+), 91 deletions(-) create mode 100644 pandas/tests/indexes/string/__init__.py create mode 100644 pandas/tests/indexes/string/test_astype.py create mode 100644 pandas/tests/indexes/string/test_indexing.py diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index ce05b5e9f2238..7e0de138aacfb 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,25 +3,7 @@ from pandas import ( Index, NaT, - Series, ) -import pandas._testing as tm - - -def test_astype_str_from_bytes(): - # https://github.com/pandas-dev/pandas/issues/38607 - # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively - # did a .decode() on the bytes object. 
In 2.0 we go through - # ensure_string_array which does f"{val}" - idx = Index(["あ", b"a"], dtype="object") - result = idx.astype(str) - expected = Index(["あ", "a"], dtype="str") - tm.assert_index_equal(result, expected) - - # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) - expected = Series(expected, dtype="str") - tm.assert_series_equal(result, expected) def test_astype_invalid_nas_to_tdt64_raises(): diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ea3d068a673e8..89648bc316c16 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,12 +3,8 @@ import numpy as np import pytest -from pandas._libs.missing import ( - NA, - is_matching_na, -) +from pandas._libs.missing import is_matching_na -import pandas as pd from pandas import Index import pandas._testing as tm @@ -23,13 +19,13 @@ class TestGetIndexer: ) def test_get_indexer_strings(self, method, expected): expected = np.array(expected, dtype=np.intp) - index = Index(["b", "c"]) + index = Index(["b", "c"], dtype=object) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self, using_infer_string): - index = Index(["b", "c"]) + def test_get_indexer_strings_raises(self): + index = Index(["b", "c"], dtype=object) msg = "|".join( [ @@ -68,13 +64,9 @@ def test_get_indexer_with_NA_values( class TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas( - self, nulls_fixture, request, using_infer_string - ): + def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work - if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): - request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) - index = Index(["a", "b", nulls_fixture]) + index = Index(["a", "b", nulls_fixture], 
dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) @@ -83,7 +75,7 @@ def test_get_indexer_non_unique_nas( tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", nulls_fixture, "b", nulls_fixture]) + index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) @@ -92,10 +84,10 @@ def test_get_indexer_non_unique_nas( # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): - index = Index(["a", float("NaN"), "b", float("NaN")]) + index = Index(["a", float("NaN"), "b", float("NaN")], dtype=object) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): - index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")], dtype=object) match_but_not_identical = True else: match_but_not_identical = False @@ -156,59 +148,3 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - -class TestSliceLocs: - @pytest.mark.parametrize( - "in_slice,expected", - [ - # error: Slice index must be an integer or None - (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] - (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] - # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] - (pd.IndexSlice["z"::-1], "yxdcb"), # type: 
ignore[misc] - (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] - (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] - (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] - (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] - ], - ) - def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) - result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_negative_step_oob(self, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - result = index[-10:5:1] - tm.assert_index_equal(result, index) - - result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_dup(self): - index = Index(["a", "a", "b", "c", "d", "d"]) - assert index.slice_locs("a", "d") == (0, 6) - assert index.slice_locs(end="d") == (0, 6) - assert index.slice_locs("a", "c") == (0, 4) - assert index.slice_locs("b", "d") == (2, 6) - - index2 = index[::-1] - assert index2.slice_locs("d", "a") == (0, 6) - assert index2.slice_locs(end="a") == (0, 6) - assert index2.slice_locs("d", "b") == (0, 4) - assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/string/__init__.py b/pandas/tests/indexes/string/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/string/test_astype.py b/pandas/tests/indexes/string/test_astype.py new file mode 100644 index 0000000000000..0349d85f23167 --- /dev/null +++ b/pandas/tests/indexes/string/test_astype.py @@ -0,0 +1,21 @@ +from pandas import ( + Index, + Series, +) +import pandas._testing as tm + + +def 
test_astype_str_from_bytes(): + # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="str") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype="str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py new file mode 100644 index 0000000000000..755b7109a5a04 --- /dev/null +++ b/pandas/tests/indexes/string/test_indexing.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), + ], + ) + def test_get_indexer_strings(self, any_string_dtype, method, expected): + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=any_string_dtype) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, any_string_dtype): + index = Index(["b", "c"], dtype=any_string_dtype) + + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 
2, 2, 2] + ) + + +class TestGetIndexerNonUnique: + @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) + def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", None, "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = 
Index(list("bcdxy"), dtype=any_string_dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self, any_string_dtype): + index = Index(["a", "a", "b", "c", "d", "d"], dtype=any_string_dtype) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) From 2ead19826b42a34bd641a14ef1089c7ea5f36a6a Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:19:22 +0300 Subject: [PATCH 20/32] DEPR: Change stacklevel to 2 in DataFrame(mgr) deprecation (#58694) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b646c5591fab..c4defdb24370f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -718,7 +718,7 @@ def __init__( "is deprecated and will raise in a future version. 
" "Use public APIs instead.", DeprecationWarning, - stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + stacklevel=2, ) data = data.copy(deep=False) From 7bd594c81acb5f6428e9ef54ba5a9da1f2860a89 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Oct 2024 10:29:37 +0100 Subject: [PATCH 21/32] TST (string dtype): add explicit object vs str dtype to index fixture (#60116) --- pandas/conftest.py | 3 ++- pandas/tests/indexes/test_any_index.py | 2 +- pandas/tests/indexes/test_old_base.py | 2 +- pandas/tests/indexes/test_setops.py | 8 +++++++- pandas/tests/test_algos.py | 1 + 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e2db9260ac37d..7ad322d050c0f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -667,7 +667,8 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(10)]), + "object": Index([f"pandas_{i}" for i in range(10)], dtype=object), + "string": Index([f"pandas_{i}" for i in range(10)], dtype="str"), "datetime": date_range("2020-01-01", periods=10), "datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"), "period": period_range("2020-01-01", periods=10, freq="D"), diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index e1ed96195e0a7..a4c18732ef258 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -40,7 +40,7 @@ def test_map_identity_mapping(index, request): # GH#12766 result = index.map(lambda x: x) - if index.dtype == object and result.dtype == bool: + if index.dtype == object and (result.dtype == bool or result.dtype == "string"): assert (index == result).all() # TODO: could work that into the 'exact="equiv"'? return # FIXME: doesn't belong in this file anymore! 
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 0199e21bfc980..65feb07e05d9f 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -256,7 +256,7 @@ def test_ensure_copied_data(self, index): "RangeIndex cannot be initialized from data, " "MultiIndex and CategoricalIndex are tested separately" ) - elif index.dtype == object and index.inferred_type == "boolean": + elif index.dtype == object and index.inferred_type in ["boolean", "string"]: init_kwargs["dtype"] = index.dtype index_type = type(index) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index e5dc47be20677..5f934ca3e6e83 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -299,7 +299,13 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - def test_symmetric_difference(self, index): + def test_symmetric_difference(self, index, using_infer_string, request): + if ( + using_infer_string + and index.dtype == "object" + and index.inferred_type == "string" + ): + request.applymarker(pytest.mark.xfail(reason="TODO: infer_string")) if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") if len(index) < 2: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index dac74a0e32a42..81e7d3774b613 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -65,6 +65,7 @@ def test_factorize_complex(self): expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj result_codes, result_uniques = obj.factorize(sort=sort) From 
71eb4571b49553b4bb1db851e2d1610b703cd943 Mon Sep 17 00:00:00 2001 From: eightyseven Date: Thu, 31 Oct 2024 02:33:14 +0800 Subject: [PATCH 22/32] DOC: fix docstring of timedeltas.ceil (#60047) * fix docstring of timedeltas.ceil * Update pandas/_libs/tslibs/timedeltas.pyx Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/tslibs/timedeltas.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 299730df86923..15b629624bafc 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2178,8 +2178,10 @@ class Timedelta(_Timedelta): Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. - It uses the same units as class constructor :class:`~pandas.Timedelta`. + Frequency string indicating the ceiling resolution. Must be a fixed + frequency like 's' (second) not 'ME' (month end). See + :ref:`frequency aliases <timeseries.offset_aliases>` for + a list of possible `freq` values.
Returns ------- From d25b3c294f6ea8ae2b1cc52d037e60951bfae543 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 31 Oct 2024 00:04:18 +0530 Subject: [PATCH 23/32] DOC: fix RT03,SA01,ES01 for pandas.set_eng_float_format (#60135) --- ci/code_checks.sh | 1 - pandas/io/formats/format.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b727d93879a86..7e3998c01cce6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -144,7 +144,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5aecc6af712e5..861f5885f80c6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1926,6 +1926,9 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non """ Format float representation in DataFrame with SI notation. + Sets the floating-point display format for ``DataFrame`` objects using engineering + notation (SI units), allowing easier readability of values across wide ranges. + Parameters ---------- accuracy : int, default 3 @@ -1936,6 +1939,13 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non Returns ------- None + This method does not return a value. It updates the global display format + for floats in DataFrames. + + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value.
Examples -------- From f0f3efca36e1088d77b442f9f781fad535f6f3cf Mon Sep 17 00:00:00 2001 From: sunlight <138234530+sunlight798@users.noreply.github.com> Date: Thu, 31 Oct 2024 02:35:35 +0800 Subject: [PATCH 24/32] DOC: fix PR07,SA01 for pandas.api.types.is_iterator (#60142) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7e3998c01cce6..768e05b16cfe9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -85,7 +85,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8b6d73cda355b..de603beff7836 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -259,15 +259,23 @@ def is_iterator(obj: object) -> bool: Check if the object is an iterator. This is intended for generators, not list-like objects. + This method checks whether the passed object is an iterator. It + returns `True` if the object is an iterator, and `False` otherwise. Parameters ---------- obj : The object to check + The object to check for iterator type. Returns ------- is_iter : bool Whether `obj` is an iterator. + `True` if the object is of iterator type, otherwise `False`. + + See Also + -------- + api.types.is_list_like : Check if the input is list-like. 
Examples -------- From 0db1f53859686af2a2dfb1712ed62698723a37dd Mon Sep 17 00:00:00 2001 From: Kevin Amparado <109636487+KevsterAmp@users.noreply.github.com> Date: Thu, 31 Oct 2024 02:51:52 +0800 Subject: [PATCH 25/32] DOC: fix broken link on team webpage (#60141) rename link to about/governance.html --- web/pandas/about/team.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md index 49b8a26ab56e8..b66e134fa5b2f 100644 --- a/web/pandas/about/team.md +++ b/web/pandas/about/team.md @@ -43,7 +43,7 @@ If you want to support pandas development, you can find information in the [dona Wes McKinney is the Benevolent Dictator for Life (BDFL). -The project governance is available in the [project governance page]({{ base_url }}governance.html). +The project governance is available in the [project governance page]({{ base_url }}about/governance.html). ## Workgroups From e7d54a54da8a179fbde5878dfb4e6440d0cfbac8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Oct 2024 20:06:30 +0100 Subject: [PATCH 26/32] BUG/TST (string dtype): fix and update tests for Stata IO (#60130) --- pandas/io/stata.py | 5 ++++ pandas/tests/io/test_stata.py | 51 +++++++++++++++++------------------ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 04bd1e32603f4..722e2c79c4e6a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -569,7 +569,11 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: if getattr(data[col].dtype, "numpy_dtype", None) is not None: data[col] = data[col].astype(data[col].dtype.numpy_dtype) elif is_string_dtype(data[col].dtype): + # TODO could avoid converting string dtype to object here, + # but handle string dtype in _encode_strings data[col] = data[col].astype("object") + # generate_table checks for None values + data.loc[data[col].isna(), col] = None dtype = data[col].dtype empty_df = data.shape[0] == 0 @@ -2725,6 +2729,7 
@@ def _encode_strings(self) -> None: continue column = self.data[col] dtype = column.dtype + # TODO could also handle string dtype here specifically if dtype.type is np.object_: inferred_dtype = infer_dtype(column, skipna=True) if not ((inferred_dtype == "string") or len(column) == 0): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9f5085ff2ad28..4b5369d61bed6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -435,9 +433,8 @@ def test_write_dta6(self, datapath, temp_file): check_index_type=False, ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta10(self, version, temp_file): + def test_read_write_dta10(self, version, temp_file, using_infer_string): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], columns=["string", "object", "integer", "floating", "datetime"], @@ -451,9 +448,11 @@ def test_read_write_dta10(self, version, temp_file): original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - expected = original[:] + expected = original.copy() # "tc" convert_dates means we store in ms expected["datetime"] = expected["datetime"].astype("M8[ms]") + if using_infer_string: + expected["object"] = expected["object"].astype("str") tm.assert_frame_equal( written_and_read_again.set_index("index"), @@ -1276,7 +1275,6 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1340,6 +1338,10 @@ def 
_convert_categorical(from_frame: DataFrame) -> DataFrame: if cat.categories.dtype == object: categories = pd.Index._with_infer(cat.categories._values) cat = cat.set_categories(categories) + elif cat.categories.dtype == "string" and len(cat.categories) == 0: + # if the read categories are empty, it comes back as object dtype + categories = cat.categories.astype(object) + cat = cat.set_categories(categories) from_frame[col] = cat return from_frame @@ -1369,7 +1371,6 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1674,12 +1675,11 @@ def test_inf(self, infval, temp_file): path = temp_file df.to_stata(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") @@ -1699,13 +1699,12 @@ def test_value_labels_iterator(self, write_index, temp_file): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self, temp_file): # GH 17328 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1733,9 +1732,9 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + 
"_fmt"] assert unformatted == formatted - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + # @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("byteorder", ["little", "big"]) - def test_writer_117(self, byteorder, temp_file): + def test_writer_117(self, byteorder, temp_file, using_infer_string): original = DataFrame( data=[ [ @@ -1802,6 +1801,9 @@ def test_writer_117(self, byteorder, temp_file): expected = original[:] # "tc" for convert_dates means we store with "ms" resolution expected["datetime"] = expected["datetime"].astype("M8[ms]") + if using_infer_string: + # object dtype (with only strings/None) comes back as string dtype + expected["object"] = expected["object"].astype("str") tm.assert_frame_equal( written_and_read_again.set_index("index"), @@ -1845,15 +1847,14 @@ def test_invalid_date_conversion(self, temp_file): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version, temp_file): # GH 21041 bio = io.BytesIO() df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1864,13 +1865,12 @@ def test_nonfile_writing(self, version, temp_file): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self, temp_file): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], 
dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1907,8 +1907,7 @@ def test_unicode_dta_118_119(self, file, datapath): tm.assert_frame_equal(unicode_df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_mixed_string_strl(self, temp_file): + def test_mixed_string_strl(self, temp_file, using_infer_string): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] output = DataFrame(output) @@ -1925,6 +1924,8 @@ def test_mixed_string_strl(self, temp_file): output.to_stata(path, write_index=False, convert_strl=["mixed"], version=117) reread = read_stata(path) expected = output.fillna("") + if using_infer_string: + expected["mixed"] = expected["mixed"].astype("str") tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -2000,7 +2001,6 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [118, 119, None]) @pytest.mark.parametrize("byteorder", ["little", "big"]) def test_utf8_writer(self, version, byteorder, temp_file): @@ -2348,13 +2348,12 @@ def test_iterator_errors(datapath, chunksize): pass -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(temp_file): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) df.to_stata(temp_file, write_index=False) - expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + expected = pd.Index(["a_label", "b_label", "c_label"]) with read_stata(temp_file, chunksize=100) as reader: for j, chunk in enumerate(reader): for i in range(2): From f9ae4cfa1d20c8e15c6aa44020ad4c653a2efb8a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Gregers=20Thomas=20Skat=20R=C3=B8rdam?= Date: Wed, 30 Oct 2024 20:23:11 +0100 Subject: [PATCH 27/32] DOC: to_latex braces in headers must be escaped (#60063) (#60103) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42516f0a85e07..1759e1ef91d85 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3339,7 +3339,7 @@ def to_latex( The subset of columns to write. Writes all columns by default. header : bool or list of str, default True Write out the column names. If a list of strings is given, - it is assumed to be aliases for the column names. + it is assumed to be aliases for the column names. Braces must be escaped. index : bool, default True Write row names (index). na_rep : str, default 'NaN' From 00d418936b401809a4a08c556ded57d388480868 Mon Sep 17 00:00:00 2001 From: auderson <48577571+auderson@users.noreply.github.com> Date: Thu, 31 Oct 2024 03:31:18 +0800 Subject: [PATCH 28/32] PERF: faster _coerce_to_data_and_mask() for astype("Float64") (#60121) * add fast path in _coerce_to_data_and_mask * update whatsnew * pre-commit --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/numeric.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5376177d3381..87d92f6618023 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -592,6 +592,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) +- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 2c0236273e731..f319a3cc05575 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -174,6 +174,8 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif values.dtype.kind == "b" and checker(dtype): + # fastpath + mask = np.zeros(len(values), dtype=np.bool_) if not copy: values = np.asarray(values, dtype=default_dtype) else: @@ -190,6 +192,10 @@ def _coerce_to_data_and_mask( if values.dtype.kind in "iu": # fastpath mask = np.zeros(len(values), dtype=np.bool_) + elif values.dtype.kind == "f": + # np.isnan is faster than is_numeric_na() for floats + # github issue: #60066 + mask = np.isnan(values) else: mask = libmissing.is_numeric_na(values) else: From 0f94e7b3f35a42af3c9ae8902eb58b65a2e10805 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:33:33 -0700 Subject: [PATCH 29/32] Bump pypa/cibuildwheel from 2.21.0 to 2.21.3 (#60035) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.21.0 to 2.21.3. 
- [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.21.0...v2.21.3) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ce48cfa463974..4bff9e7e090da 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -156,7 +156,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.0 + uses: pypa/cibuildwheel@v2.21.3 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From 2323b5495819e20fe05892cc532d2bb3f83c3d0d Mon Sep 17 00:00:00 2001 From: steeleelliott03 <100764769+steeleelliott03@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:47:30 -0400 Subject: [PATCH 30/32] DOC: Fix title capitalization in documentation files (#32550) (#59972) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corrected title capitalization in various .rst files to match the standard of capitalizing only the first word, unless a term like DataFrame or Series is involved. Ran the script to find and correct heading issues in the following files: - doc/source/user_guide/timedeltas.rst - doc/source/whatsnew/v0.7.0.rst - doc/source/whatsnew/v0.23.4.rst - (… and so on) Fixes part of issue #32550. 
--- doc/source/user_guide/cookbook.rst | 6 +++--- doc/source/user_guide/gotchas.rst | 2 +- doc/source/user_guide/groupby.rst | 6 +++--- doc/source/user_guide/integer_na.rst | 2 +- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v1.0.2.rst | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 42430fb1fbba0..1525afcac87f7 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -35,7 +35,7 @@ These are some neat pandas ``idioms`` ) df -if-then... +If-then... ********** An if-then on one column @@ -176,7 +176,7 @@ One could hard code: Selection --------- -Dataframes +DataFrames ********** The :ref:`indexing ` docs. @@ -1489,7 +1489,7 @@ of the data values: ) df -Constant series +Constant Series --------------- To assess if a series has a constant value, we can check if ``series.nunique() <= 1``. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 26eb656357bf6..842f30f06676e 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -121,7 +121,7 @@ Below is how to check if any of the values are ``True``: if pd.Series([False, True, False]).any(): print("I am any") -Bitwise boolean +Bitwise Boolean ~~~~~~~~~~~~~~~ Bitwise boolean operators like ``==`` and ``!=`` return a boolean :class:`Series` diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8c80fa7052dd5..acb5a2b7919ac 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -618,7 +618,7 @@ this will make an extra copy. .. _groupby.aggregate.udf: -Aggregation with User-Defined Functions +Aggregation with user-defined functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Users can also provide their own User-Defined Functions (UDFs) for custom aggregations. 
@@ -1261,7 +1261,7 @@ with df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) -Numba Accelerated Routines +Numba accelerated routines -------------------------- .. versionadded:: 1.1 @@ -1696,7 +1696,7 @@ introduction ` and the dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() -Groupby by indexer to 'resample' data +GroupBy by indexer to 'resample' data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 76a2f22b7987d..8d35d1583d3bd 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -147,7 +147,7 @@ Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. df.sum() df.groupby("B").A.sum() -Scalar NA Value +Scalar NA value --------------- :class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fa64bce60caf4..7c165c87adb46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5996,7 +5996,7 @@ Full documentation can be found `here Date: Wed, 30 Oct 2024 22:20:59 +0100 Subject: [PATCH 31/32] CI/TST: fix parquet tz test returning pytz fixed offset (pyarrow 18) (#60143) * CI/TST: fix parquet tz test returning pytz fixed offset (pyarrow 18) * only convert to pytz if installed --- pandas/tests/io/test_parquet.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4c2ea036f08dc..6ef7105cf5ccc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,7 +17,6 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, - pa_version_under18p0, ) import pandas as 
pd @@ -974,21 +973,9 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + def test_timezone_aware_index(self, pa, timezone_aware_date_list): pytest.importorskip("pyarrow", "11.0.0") - if ( - timezone_aware_date_list.tzinfo != datetime.timezone.utc - and pa_version_under18p0 - ): - request.applymarker( - pytest.mark.xfail( - reason=( - "pyarrow returns pytz.FixedOffset while pandas " - "constructs datetime.timezone https://github.com/pandas-dev/pandas/issues/37286" - ) - ) - ) idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1005,6 +992,18 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): expected = df[:] if pa_version_under11p0: expected.index = expected.index.as_unit("ns") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone + # https://github.com/pandas-dev/pandas/issues/37286 + try: + import pytz + except ImportError: + pass + else: + offset = df.index.tz.utcoffset(timezone_aware_date_list) + tz = pytz.FixedOffset(offset.total_seconds() / 60) + expected.index = expected.index.tz_convert(tz) + expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): From 2a1ca9da299db95231a96b271b8952ac3c9977fb Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 30 Oct 2024 19:04:24 -0400 Subject: [PATCH 32/32] TST: Retyping of categorical column with NaN (#60112) * consistent name usage * changed to numpy array of integers * Remove redundant assert Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> --- pandas/tests/dtypes/test_dtypes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 33232e8df14e9..b7e37ff270e60 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1243,3 +1243,12 @@ def test_loc_setitem_empty_labels_no_dtype_conversion(): assert df.a.dtype == "int64" tm.assert_frame_equal(df, expected) + + +def test_categorical_nan_no_dtype_conversion(): + # GH 43996 + + df = pd.DataFrame({"a": Categorical([np.nan], [1]), "b": [1]}) + expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) + df.loc[0, "a"] = np.array([1]) + tm.assert_frame_equal(df, expected)