diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ce48cfa463974..4bff9e7e090da 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -156,7 +156,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.0 + uses: pypa/cibuildwheel@v2.21.3 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1cb7b288aba69..87212309725c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,9 +2,9 @@ minimum_pre_commit_version: 2.15.0 exclude: ^LICENSES/|\.(html|csv|svg)$ # reserve "manual" for relatively slow hooks which we still want to run in CI default_stages: [ - commit, - merge-commit, - push, + pre-commit, + pre-merge-commit, + pre-push, prepare-commit-msg, commit-msg, post-checkout, diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 1974c98a1d1ff..768e05b16cfe9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -85,8 +85,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ @@ -97,43 +95,35 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i "pandas.core.resample.Resampler.groups SA01" \ -i "pandas.core.resample.Resampler.indices SA01" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ - -i "pandas.core.resample.Resampler.ohlc SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ -i "pandas.core.resample.Resampler.sem SA01" \ -i "pandas.core.resample.Resampler.std SA01" \ - -i 
"pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ -i "pandas.errors.AttributeConflictWarning SA01" \ - -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.ChainedAssignmentError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ @@ -142,22 +132,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OptionError SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ - -i "pandas.errors.PossiblePrecisionLoss SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ - -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ - -i "pandas.json_normalize RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ - -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ @@ -303,7 +288,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ -i "pandas.tseries.offsets.Second.n GL08" \ -i "pandas.tseries.offsets.Second.normalize GL08" \ - -i "pandas.tseries.offsets.SemiMonthBegin SA01" \ -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index c33c0344e742f..b1c7fda910f67 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8692b6e35ab2d..f7fc4c38add90 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -53,7 +53,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 8e7d9aba7878d..f1ab3c37c4c71 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 6c97960a62d40..d39d572eda619 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/ci/deps/circle-311-arm64.yaml 
b/ci/deps/circle-311-arm64.yaml index c86534871b3d2..def7faeb8bcaa 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -52,7 +52,7 @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <2024.10.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 670ffe6996302..c1cfb0d7a623b 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -298,6 +298,12 @@ So, before actually writing any code, you should write your tests. Often the te taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. +We use `code coverage <https://en.wikipedia.org/wiki/Code_coverage>`_ to help understand +the amount of code which is covered by a test. We recommend striving to ensure code +you add or change within pandas is covered by a test. Please see our +`code coverage dashboard through Codecov <https://app.codecov.io/gh/pandas-dev/pandas>`_ +for more information. + Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 42430fb1fbba0..1525afcac87f7 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -35,7 +35,7 @@ These are some neat pandas ``idioms`` ) df -if-then... +If-then... ********** An if-then on one column @@ -176,7 +176,7 @@ One could hard code: Selection --------- -Dataframes +DataFrames ********** The :ref:`indexing <indexing>` docs. @@ -1489,7 +1489,7 @@ of the data values: ) df -Constant series +Constant Series --------------- To assess if a series has a constant value, we can check if ``series.nunique() <= 1``. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 26eb656357bf6..842f30f06676e 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -121,7 +121,7 @@ Below is how to check if any of the values are ``True``: if pd.Series([False, True, False]).any(): print("I am any") -Bitwise boolean +Bitwise Boolean ~~~~~~~~~~~~~~~ Bitwise boolean operators like ``==`` and ``!=`` return a boolean :class:`Series` diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8c80fa7052dd5..acb5a2b7919ac 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -618,7 +618,7 @@ this will make an extra copy. .. _groupby.aggregate.udf: -Aggregation with User-Defined Functions +Aggregation with user-defined functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Users can also provide their own User-Defined Functions (UDFs) for custom aggregations. @@ -1261,7 +1261,7 @@ with df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) -Numba Accelerated Routines +Numba accelerated routines -------------------------- .. versionadded:: 1.1 @@ -1696,7 +1696,7 @@ introduction ` and the dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup() -Groupby by indexer to 'resample' data +GroupBy by indexer to 'resample' data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 76a2f22b7987d..8d35d1583d3bd 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -147,7 +147,7 @@ Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well. df.sum() df.groupby("B").A.sum() -Scalar NA Value +Scalar NA value --------------- :class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fa64bce60caf4..7c165c87adb46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5996,7 +5996,7 @@ Full documentation can be found `here diff --git a/environment.yml b/environment.yml --- a/environment.yml +++ b/environment.yml @@ dependencies: - scipy>=1.10.0 - sqlalchemy>=2.0.0 - tabulate>=0.9.0 - - xarray>=2022.12.0 + - xarray>=2022.12.0, <=2024.9.0 - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 25760df6bd7a4..1d57aa806e0f1 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -105,6 +105,10 @@ class OptionError(AttributeError, KeyError): Backwards compatible with KeyError checks. + See Also + -------- + options : Access and modify global pandas settings. + Examples -------- >>> pd.options.context diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 23e0f387466aa..de603beff7836 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -259,15 +259,23 @@ def is_iterator(obj: object) -> bool: Check if the object is an iterator. This is intended for generators, not list-like objects. + This method checks whether the passed object is an iterator. It + returns `True` if the object is an iterator, and `False` otherwise. Parameters ---------- obj : The object to check + The object to check for iterator type. Returns ------- is_iter : bool Whether `obj` is an iterator. + `True` if the object is of iterator type, otherwise `False`. + + See Also + -------- + api.types.is_list_like : Check if the input is list-like. Examples -------- @@ -1122,9 +1130,23 @@ def is_integer(obj: object) -> bool: """ Return True if given object is integer. + This method checks whether the passed object is an integer type. It + returns `True` if the object is an integer, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for integer type. + Returns ------- bool + `True` if the object is of integer type, otherwise `False`. + + See Also + -------- + api.types.is_float : Check if an object is of float type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4db96fbaa3aad..7569f8e8864a0 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3371,6 +3371,10 @@ cdef class SemiMonthBegin(SemiMonthOffset): """ Two DateOffset's per month repeating on the first day of the month & day_of_month. + This offset moves dates to the first day of the month and an additional specified + day (the 15th by default), useful in scenarios where semi-monthly processing + occurs on set days. + Attributes ---------- n : int, default 1 @@ -3380,6 +3384,13 @@ cdef class SemiMonthBegin(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthEnd : Two DateOffset's per month repeating on the last day + of the month & day_of_month.
+ tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index bbefea7c47fc3..15b629624bafc 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1864,10 +1864,12 @@ class Timedelta(_Timedelta): Parameters ---------- - value : Timedelta, timedelta, np.timedelta64, str, or int + value : Timedelta, timedelta, np.timedelta64, str, int or float Input value. unit : str, default 'ns' - Denote the unit of the input, if input is an integer. + If input is an integer, denote the unit of the input. + If input is a float, denote the unit of the integer parts. + The decimal parts with resolution lower than 1 nanosecond are ignored. Possible values: @@ -2176,8 +2178,10 @@ class Timedelta(_Timedelta): Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. - It uses the same units as class constructor :class:`~pandas.Timedelta`. + Frequency string indicating the ceiling resolution. Must be a fixed + frequency like 's' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. Returns ------- diff --git a/pandas/conftest.py b/pandas/conftest.py index e2db9260ac37d..7ad322d050c0f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -667,7 +667,8 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(10)]), + "object": Index([f"pandas_{i}" for i in range(10)], dtype=object), + "string": Index([f"pandas_{i}" for i in range(10)], dtype="str"), "datetime": date_range("2020-01-01", periods=10), "datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"), "period": period_range("2020-01-01", periods=10, freq="D"), diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 2c0236273e731..f319a3cc05575 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -174,6 +174,8 @@ def _coerce_to_data_and_mask( raise TypeError(f"{values.dtype} cannot be converted to {name}") elif values.dtype.kind == "b" and checker(dtype): + # fastpath + mask = np.zeros(len(values), dtype=np.bool_) if not copy: values = np.asarray(values, dtype=default_dtype) else: @@ -190,6 +192,10 @@ def _coerce_to_data_and_mask( if values.dtype.kind in "iu": # fastpath mask = np.zeros(len(values), dtype=np.bool_) + elif values.dtype.kind == "f": + # np.isnan is faster than is_numeric_na() for floats + # github issue: #60066 + mask = np.isnan(values) else: mask = libmissing.is_numeric_na(values) else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24a164aa15427..c4defdb24370f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -718,7 +718,7 @@ def __init__( "is deprecated and will raise in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + stacklevel=2, ) data = data.copy(deep=False) @@ -10823,7 +10823,7 @@ def round( self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs ) -> DataFrame: """ - Round a DataFrame to a variable number of decimal places. + Round numeric columns in a DataFrame to a variable number of decimal places. 
Parameters ---------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42516f0a85e07..1759e1ef91d85 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3339,7 +3339,7 @@ def to_latex( The subset of columns to write. Writes all columns by default. header : bool or list of str, default True Write out the column names. If a list of strings is given, - it is assumed to be aliases for the column names. + it is assumed to be aliases for the column names. Braces must be escaped. index : bool, default True Write row names (index). na_rep : str, default 'NaN' diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8dfef9e70db52..a0bd25525c55f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -767,10 +767,24 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: """ Groupby iterator. + This method provides an iterator over the groups created by the ``resample`` + or ``groupby`` operation on the object. The method yields tuples where + the first element is the label (group key) corresponding to each group or + resampled bin, and the second element is the subset of the data that falls + within that group or bin. + Returns ------- - Generator yielding sequence of (name, subsetted object) - for each group + Iterator + Generator yielding a sequence of (name, subsetted object) + for each group. + + See Also + -------- + Series.groupby : Group data by a specific key or column. + DataFrame.groupby : Group DataFrame using mapper or by columns. + DataFrame.resample : Resample a DataFrame. + Series.resample : Resample a Series. Examples -------- @@ -3224,6 +3238,12 @@ def ohlc(self) -> DataFrame: DataFrame Open, high, low and close values within each group. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + DataFrame.resample : Resample time-series data. + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Examples -------- diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 42fed83398737..ca4d3fc768efb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1021,6 +1021,10 @@ def sum( """ Compute sum of group values. + This method provides a simple way to compute the sum of values within each + resampled group, particularly useful for aggregating time-based data into + daily, monthly, or yearly sums. + Parameters ---------- numeric_only : bool, default False @@ -1039,6 +1043,14 @@ def sum( Series or DataFrame Computed sum of values within each group. + See Also + -------- + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + core.resample.Resampler.count : Compute count of group, excluding missing + values. + DataFrame.resample : Resample time-series data. + Series.sum : Return the sum of the values over the requested axis. 
+ Examples -------- >>> ser = pd.Series( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3cb0e75cfb815..05e1a36877e06 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -255,7 +255,9 @@ def _validate(data): inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") + raise AttributeError( + f"Can only use .str accessor with string values, not {inferred_dtype}" + ) return inferred_dtype def __getitem__(self, key): diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index efc032b0b559e..0aaee1ec177ee 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -76,6 +76,12 @@ class UnsupportedFunctionCall(ValueError): For example, ``np.cumsum(groupby_object)``. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + core.groupby.GroupBy.cumsum : Compute cumulative sum for each group. + Examples -------- >>> df = pd.DataFrame( @@ -591,6 +597,14 @@ class CSSWarning(UserWarning): This can be due to the styling not having an equivalent value or because the styling isn't properly formatted. + See Also + -------- + DataFrame.style : Returns a Styler object for applying CSS-like styles. + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + io.formats.style.Styler.to_excel : Export styled DataFrame to Excel. + io.formats.style.Styler.to_html : Export styled DataFrame to HTML. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1]}) @@ -691,6 +705,10 @@ class PossiblePrecisionLoss(Warning): When the column value is outside or equal to the int64 value the column is converted to a float64 dtype. + See Also + -------- + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)}) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 5aecc6af712e5..861f5885f80c6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1926,6 +1926,9 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non """ Format float representation in DataFrame with SI notation. + Sets the floating-point display format for ``DataFrame`` objects using engineering + notation (SI units), allowing easier readability of values across wide ranges. + Parameters ---------- accuracy : int, default 3 @@ -1936,6 +1939,13 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non Returns ------- None + This method does not return a value. It updates the global display format + for floats in DataFrames. + + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. Examples -------- diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 7d3eefae39679..45c8876dbe3e5 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -279,6 +279,10 @@ def json_normalize( """ Normalize semi-structured JSON data into a flat table. + This method is designed to transform semi-structured JSON data, such as nested + dictionaries or lists, into a flat table. This is particularly useful when + handling JSON-like data structures that contain deeply nested fields.
+ Parameters ---------- data : dict, list of dicts, or Series of dicts @@ -310,8 +314,13 @@ def json_normalize( Returns ------- - frame : DataFrame - Normalize semi-structured JSON data into a flat table. + DataFrame + The normalized data, represented as a pandas DataFrame. + + See Also + -------- + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Series : One-dimensional ndarray with axis labels (including time series). Examples -------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 04bd1e32603f4..722e2c79c4e6a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -569,7 +569,11 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: if getattr(data[col].dtype, "numpy_dtype", None) is not None: data[col] = data[col].astype(data[col].dtype.numpy_dtype) elif is_string_dtype(data[col].dtype): + # TODO could avoid converting string dtype to object here, + # but handle string dtype in _encode_strings data[col] = data[col].astype("object") + # generate_table checks for None values + data.loc[data[col].isna(), col] = None dtype = data[col].dtype empty_df = data.shape[0] == 0 @@ -2725,6 +2729,7 @@ def _encode_strings(self) -> None: continue column = self.data[col] dtype = column.dtype + # TODO could also handle string dtype here specifically if dtype.type is np.object_: inferred_dtype = infer_dtype(column, skipna=True) if not ((inferred_dtype == "string") or len(column) == 0): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 33232e8df14e9..b7e37ff270e60 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1243,3 +1243,12 @@ def test_loc_setitem_empty_labels_no_dtype_conversion(): assert df.a.dtype == "int64" tm.assert_frame_equal(df, expected) + + +def test_categorical_nan_no_dtype_conversion(): + # GH 43996 + + df = pd.DataFrame({"a": Categorical([np.nan], [1]), "b": [1]}) + expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]}) + df.loc[0, "a"] = np.array([1]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index ce05b5e9f2238..7e0de138aacfb 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,25 +3,7 @@ from pandas import ( Index, NaT, - Series, ) -import pandas._testing as tm - - -def test_astype_str_from_bytes(): - # https://github.com/pandas-dev/pandas/issues/38607 - # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively - # did a .decode() on the bytes object.
In 2.0 we go through - # ensure_string_array which does f"{val}" - idx = Index(["あ", b"a"], dtype="object") - result = idx.astype(str) - expected = Index(["あ", "a"], dtype="str") - tm.assert_index_equal(result, expected) - - # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) - expected = Series(expected, dtype="str") - tm.assert_series_equal(result, expected) def test_astype_invalid_nas_to_tdt64_raises(): diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ea3d068a673e8..89648bc316c16 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,12 +3,8 @@ import numpy as np import pytest -from pandas._libs.missing import ( - NA, - is_matching_na, -) +from pandas._libs.missing import is_matching_na -import pandas as pd from pandas import Index import pandas._testing as tm @@ -23,13 +19,13 @@ class TestGetIndexer: ) def test_get_indexer_strings(self, method, expected): expected = np.array(expected, dtype=np.intp) - index = Index(["b", "c"]) + index = Index(["b", "c"], dtype=object) actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self, using_infer_string): - index = Index(["b", "c"]) + def test_get_indexer_strings_raises(self): + index = Index(["b", "c"], dtype=object) msg = "|".join( [ @@ -68,13 +64,9 @@ def test_get_indexer_with_NA_values( class TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas( - self, nulls_fixture, request, using_infer_string - ): + def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work - if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): - request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) - index = Index(["a", "b", nulls_fixture]) + index = Index(["a", "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) @@ -83,7 +75,7 @@ def test_get_indexer_non_unique_nas( tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", nulls_fixture, "b", nulls_fixture]) + index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) @@ -92,10 +84,10 @@ def test_get_indexer_non_unique_nas( # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): - index = Index(["a", float("NaN"), "b", float("NaN")]) + index = Index(["a", float("NaN"), "b", float("NaN")], dtype=object) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): - index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")], dtype=object) match_but_not_identical = True else: match_but_not_identical = False @@ -156,59 +148,3 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - -class TestSliceLocs: - @pytest.mark.parametrize( - "in_slice,expected", - [ - # error: Slice index must be an integer or None - (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] - 
(pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] - # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] - (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] - (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] - (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] - (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] - (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] - ], - ) - def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) - result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_negative_step_oob(self, any_string_dtype): - index = Index(list("bcdxy"), dtype=any_string_dtype) - - result = index[-10:5:1] - tm.assert_index_equal(result, index) - - result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype=any_string_dtype) - tm.assert_index_equal(result, expected) - - def test_slice_locs_dup(self): - index = Index(["a", "a", "b", "c", "d", "d"]) - assert index.slice_locs("a", "d") == (0, 6) - assert index.slice_locs(end="d") == (0, 6) - assert index.slice_locs("a", "c") == (0, 4) - assert index.slice_locs("b", "d") == (2, 6) - - index2 = index[::-1] - assert index2.slice_locs("d", "a") == (0, 6) - assert index2.slice_locs(end="a") == (0, 6) - assert index2.slice_locs("d", "b") == (0, 4) - assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/string/__init__.py b/pandas/tests/indexes/string/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/string/test_astype.py b/pandas/tests/indexes/string/test_astype.py new file mode 100644 index 0000000000000..0349d85f23167 --- /dev/null +++ b/pandas/tests/indexes/string/test_astype.py @@ -0,0 +1,21 @@ +from pandas import ( + Index, + Series, +) +import pandas._testing as tm + + +def test_astype_str_from_bytes(): + # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. 
In 2.0 we go through + # ensure_string_array which does f"{val}" + idx = Index(["あ", b"a"], dtype="object") + result = idx.astype(str) + expected = Index(["あ", "a"], dtype="str") + tm.assert_index_equal(result, expected) + + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected, dtype="str") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py new file mode 100644 index 0000000000000..755b7109a5a04 --- /dev/null +++ b/pandas/tests/indexes/string/test_indexing.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Index +import pandas._testing as tm + + +class TestGetIndexer: + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", [-1, 0, 1, 1]), + ("backfill", [0, 0, 1, -1]), + ], + ) + def test_get_indexer_strings(self, any_string_dtype, method, expected): + expected = np.array(expected, dtype=np.intp) + index = Index(["b", "c"], dtype=any_string_dtype) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) + + tm.assert_numpy_array_equal(actual, expected) + + def test_get_indexer_strings_raises(self, any_string_dtype): + index = Index(["b", "c"], dtype=any_string_dtype) + + msg = "|".join( + [ + "operation 'sub' not supported for dtype 'str", + r"unsupported operand type\(s\) for -: 'str' and 'str'", + ] + ) + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + +class TestGetIndexerNonUnique: + @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) + def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", None, "b", None], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + +class TestSliceLocs: + @pytest.mark.parametrize( + "in_slice,expected", + [ + # error: Slice index must be an integer or None + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type:
ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] + ], + ) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] + expected = Index(list(expected), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype=any_string_dtype) + tm.assert_index_equal(result, expected) + + def test_slice_locs_dup(self, any_string_dtype): + index = Index(["a", "a", "b", "c", "d", "d"], dtype=any_string_dtype) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) + + index2 = index[::-1] + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index e1ed96195e0a7..a4c18732ef258 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -40,7 +40,7 @@ def test_map_identity_mapping(index, request): # GH#12766 result = index.map(lambda x: x) - if index.dtype == object and result.dtype == bool: + if index.dtype == object and (result.dtype == bool or result.dtype == "string"): assert (index == result).all() # TODO: could work that into the 'exact="equiv"'? return # FIXME: doesn't belong in this file anymore! 
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 0199e21bfc980..65feb07e05d9f 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -256,7 +256,7 @@ def test_ensure_copied_data(self, index): "RangeIndex cannot be initialized from data, " "MultiIndex and CategoricalIndex are tested separately" ) - elif index.dtype == object and index.inferred_type == "boolean": + elif index.dtype == object and index.inferred_type in ["boolean", "string"]: init_kwargs["dtype"] = index.dtype index_type = type(index) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index e5dc47be20677..5f934ca3e6e83 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -299,7 +299,13 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - def test_symmetric_difference(self, index): + def test_symmetric_difference(self, index, using_infer_string, request): + if ( + using_infer_string + and index.dtype == "object" + and index.inferred_type == "string" + ): + request.applymarker(pytest.mark.xfail(reason="TODO: infer_string")) if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") if len(index) < 2: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4c2ea036f08dc..6ef7105cf5ccc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,7 +17,6 @@ pa_version_under13p0, pa_version_under15p0, pa_version_under17p0, - pa_version_under18p0, ) import pandas as pd @@ -974,21 +973,9 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) - def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): + def test_timezone_aware_index(self, pa, timezone_aware_date_list): pytest.importorskip("pyarrow", "11.0.0") - if ( - timezone_aware_date_list.tzinfo != datetime.timezone.utc - and pa_version_under18p0 - ): - request.applymarker( - pytest.mark.xfail( - reason=( - "pyarrow returns pytz.FixedOffset while pandas " - "constructs datetime.timezone https://github.com/pandas-dev/pandas/issues/37286" - ) - ) - ) idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) @@ -1005,6 +992,18 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): expected = df[:] if pa_version_under11p0: expected.index = expected.index.as_unit("ns") + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone + # https://github.com/pandas-dev/pandas/issues/37286 + try: + import pytz + except ImportError: + pass + else: + offset = df.index.tz.utcoffset(timezone_aware_date_list) + tz = pytz.FixedOffset(offset.total_seconds() / 60) + expected.index = expected.index.tz_convert(tz) + expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz) check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 1aa9f6dca0303..950f74a686b8d 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -177,4 +177,6 @@ def test_spss_metadata(datapath): "modification_time": 
datetime.datetime(2015, 2, 6, 14, 33, 36), } ) - assert df.attrs == metadata + if Version(pyreadstat.__version__) >= Version("1.2.8"): + metadata["mr_sets"] = {} + tm.assert_dict_equal(df.attrs, metadata) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 9f5085ff2ad28..4b5369d61bed6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -11,8 +11,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -435,9 +433,8 @@ def test_write_dta6(self, datapath, temp_file): check_index_type=False, ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) - def test_read_write_dta10(self, version, temp_file): + def test_read_write_dta10(self, version, temp_file, using_infer_string): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], columns=["string", "object", "integer", "floating", "datetime"], @@ -451,9 +448,11 @@ def test_read_write_dta10(self, version, temp_file): original.to_stata(path, convert_dates={"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) - expected = original[:] + expected = original.copy() # "tc" convert_dates means we store in ms expected["datetime"] = expected["datetime"].astype("M8[ms]") + if using_infer_string: + expected["object"] = expected["object"].astype("str") tm.assert_frame_equal( written_and_read_again.set_index("index"), @@ -1276,7 +1275,6 @@ def test_categorical_ordering(self, file, datapath): assert parsed[col].cat.ordered assert not parsed_unordered[col].cat.ordered - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1340,6 +1338,10 @@ def _convert_categorical(from_frame: DataFrame) -> DataFrame: if cat.categories.dtype == object: categories = pd.Index._with_infer(cat.categories._values) cat = cat.set_categories(categories) + elif cat.categories.dtype == "string" and len(cat.categories) == 0: + # if the read categories are empty, it comes back as object dtype + categories = cat.categories.astype(object) + cat = cat.set_categories(categories) from_frame[col] = cat return from_frame @@ -1369,7 +1371,6 @@ def test_iterator(self, datapath): from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.filterwarnings("ignore::UserWarning") @pytest.mark.parametrize( "file", @@ -1674,12 +1675,11 @@ def test_inf(self, infval, temp_file): path = temp_file df.to_stata(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_pathlib(self): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") @@ -1699,13 +1699,12 @@ def test_value_labels_iterator(self, write_index, temp_file): value_labels = dta_iter.value_labels() assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}} - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_set_index(self, temp_file): # GH 17328 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), 
- columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1733,9 +1732,9 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + # @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("byteorder", ["little", "big"]) - def test_writer_117(self, byteorder, temp_file): + def test_writer_117(self, byteorder, temp_file, using_infer_string): original = DataFrame( data=[ [ @@ -1802,6 +1801,9 @@ def test_writer_117(self, byteorder, temp_file): expected = original[:] # "tc" for convert_dates means we store with "ms" resolution expected["datetime"] = expected["datetime"].astype("M8[ms]") + if using_infer_string: + # object dtype (with only strings/None) comes back as string dtype + expected["object"] = expected["object"].astype("str") tm.assert_frame_equal( written_and_read_again.set_index("index"), @@ -1845,15 +1847,14 @@ def test_invalid_date_conversion(self, temp_file): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version, temp_file): # GH 21041 bio = io.BytesIO() df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1864,13 +1865,12 @@ def test_nonfile_writing(self, version, temp_file): reread = read_stata(path, index_col="index") tm.assert_frame_equal(df, reread) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_gzip_writing(self, temp_file): # writing version 117 requires seek and cannot be used with gzip df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.index.name = "index" path = temp_file @@ -1907,8 +1907,7 @@ def test_unicode_dta_118_119(self, file, datapath): tm.assert_frame_equal(unicode_df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_mixed_string_strl(self, temp_file): + def test_mixed_string_strl(self, temp_file, using_infer_string): # GH 23633 output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}] output = DataFrame(output) @@ -1925,6 +1924,8 @@ def test_mixed_string_strl(self, temp_file): output.to_stata(path, write_index=False, convert_strl=["mixed"], version=117) reread = read_stata(path) expected = output.fillna("") + if using_infer_string: + expected["mixed"] = expected["mixed"].astype("str") tm.assert_frame_equal(reread, expected) @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @@ -2000,7 +2001,6 @@ def test_stata_119(self, datapath): reader._ensure_open() assert reader._nvar == 32999 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("version", [118, 119, None]) 
@pytest.mark.parametrize("byteorder", ["little", "big"]) def test_utf8_writer(self, version, byteorder, temp_file): @@ -2348,13 +2348,12 @@ def test_iterator_errors(datapath, chunksize): pass -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_iterator_value_labels(temp_file): # GH 31544 values = ["c_label", "b_label"] + ["a_label"] * 500 df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) df.to_stata(temp_file, write_index=False) - expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + expected = pd.Index(["a_label", "b_label", "c_label"]) with read_stata(temp_file, chunksize=100) as reader: for j, chunk in enumerate(reader): for i in range(2): diff --git a/pandas/tests/series/accessors/test_str_accessor.py b/pandas/tests/series/accessors/test_str_accessor.py index 09d965ef1f322..ff530459b78fb 100644 --- a/pandas/tests/series/accessors/test_str_accessor.py +++ b/pandas/tests/series/accessors/test_str_accessor.py @@ -15,7 +15,8 @@ def test_str_attribute(self): # str accessor only valid with string values ser = Series(range(5)) - with pytest.raises(AttributeError, match="only use .str accessor"): + msg = "Can only use .str accessor with string values, not integer" + with pytest.raises(AttributeError, match=msg): ser.str.repeat(2) def test_str_accessor_updates_on_inplace(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index dac74a0e32a42..81e7d3774b613 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -65,6 +65,7 @@ def test_factorize_complex(self): expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj result_codes, result_uniques = obj.factorize(sort=sort) diff --git a/pyproject.toml b/pyproject.toml index d6a963e94f5b8..6dfee8f4910db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python==0.13.1", - "meson==1.2.1", + "meson-python>=0.13.1", + "meson>=1.2.1,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0rc1, so that built wheels are compatible diff --git a/requirements-dev.txt b/requirements-dev.txt index 1bf42af6bf2cd..00e320e6370ce 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -44,7 +44,7 @@ s3fs>=2022.11.0 scipy>=1.10.0 SQLAlchemy>=2.0.0 tabulate>=0.9.0 -xarray>=2022.12.0 +xarray>=2022.12.0, <=2024.9.0 xlrd>=2.0.1 xlsxwriter>=3.0.5 zstandard>=0.19.0 diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md index 49b8a26ab56e8..b66e134fa5b2f 100644 --- a/web/pandas/about/team.md +++ b/web/pandas/about/team.md @@ -43,7 +43,7 @@ If you want to support pandas development, you can find information in the [dona Wes McKinney is the Benevolent Dictator for Life (BDFL). -The project governance is available in the [project governance page]({{ base_url }}governance.html). +The project governance is available in the [project governance page]({{ base_url }}about/governance.html). 
## Workgroups diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 2ea10954fc929..6c69ff7602491 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -239,6 +239,17 @@ Console](https://docs.spyder-ide.org/current/panes/ipythonconsole.html), and Spy render Numpydoc documentation on pandas objects in rich text with Sphinx both automatically and on-demand. +### [marimo](https://marimo.io) + +marimo is a reactive notebook for Python and SQL that enhances productivity when working with dataframes. It provides several features to make data manipulation and visualization more interactive and fun: + +1. Rich, interactive displays: marimo can display pandas dataframes in interactive tables or charts with filtering and sorting capabilities. +2. Data selection: Users can select data in tables or pandas-backed plots, and the selections are automatically sent to Python as pandas dataframes. +3. No-code transformations: Users can interactively transform pandas dataframes using a GUI, without writing code. The generated code can be copied and pasted into the notebook. +4. Custom filters: marimo allows the creation of pandas-backed filters using UI elements like sliders and dropdowns. +5. Dataset explorer: marimo automatically discovers and displays all dataframes in the notebook, allowing users to explore and visualize data interactively. +6. SQL integration: marimo allows users to write SQL queries against any pandas dataframes existing in memory. + ## API ### [pandas-datareader](https://github.com/pydata/pandas-datareader)