From 7deee5707e437bcde9dd622374834f84565dd8f0 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Mon, 1 Dec 2025 21:07:44 +0530 Subject: [PATCH 1/2] mypy fixes --- xarray/core/dataarray.py | 12 ++++++++++-- xarray/core/dataset.py | 23 +++++++++++++++++++---- xarray/tests/test_dataarray.py | 26 ++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 6c8d0617038..f8a42c53faf 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame: return pandas_object def to_dataframe( - self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None + self, + name: Hashable | None = None, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, ) -> pd.DataFrame: """Convert this array and its coordinates into a tidy pandas.DataFrame. @@ -3979,6 +3982,11 @@ def to_dataframe( If provided, must include all dimensions of this DataArray. By default, dimensions are sorted according to the DataArray dimensions order. + create_index : bool, default: True + If True (default), create a MultiIndex from the Cartesian product + of this DataArray's indices. If False, use a RangeIndex instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. Returns ------- @@ -4013,7 +4021,7 @@ def to_dataframe( else: ordered_dims = ds._normalize_dim_order(dim_order=dim_order) - df = ds._to_dataframe(ordered_dims) + df = ds._to_dataframe(ordered_dims, create_index=create_index) df.columns = [name if c == unique_name else c for c in df.columns] return df diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9c2c2f60db1..84d5a26ba8d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame: "Please use Dataset.to_dataframe() instead." ) - def _to_dataframe(self, ordered_dims: Mapping[Any, int]): + def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True): from xarray.core.extension_array import PandasExtensionArray # All and only non-index arrays (whether data or coordinates) should @@ -7231,7 +7231,13 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): self._variables[k].set_dims(ordered_dims).values.reshape(-1) for k in non_extension_array_columns ] - index = self.coords.to_index([*ordered_dims]) + if create_index: + index = self.coords.to_index([*ordered_dims]) + else: + # Use a simple RangeIndex when create_index=False + # Calculate the total size from ordered_dims + total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0 + index = pd.RangeIndex(total_size) broadcasted_df = pd.DataFrame( { **dict(zip(non_extension_array_columns, data, strict=True)), @@ -7259,7 +7265,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): broadcasted_df = broadcasted_df.join(extension_array_df) return broadcasted_df[columns_in_order] - def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame: + def to_dataframe( + self, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, + ) -> pd.DataFrame: """Convert this dataset into a pandas.DataFrame. Non-index variables in this dataset form the columns of the @@ -7278,6 +7288,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr If provided, must include all dimensions of this dataset. By default, dimensions are in the same order as in `Dataset.sizes`. + create_index : bool, default: True + If True (default), create a MultiIndex from the Cartesian product + of this dataset's indices. If False, use a RangeIndex instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. Returns ------- @@ -7288,7 +7303,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr ordered_dims = self._normalize_dim_order(dim_order=dim_order) - return self._to_dataframe(ordered_dims=ordered_dims) + return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index) def _set_sparse_data_from_dataframe( self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 5eec7b8a2fd..2cf75fb58e2 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3553,6 +3553,32 @@ def test_to_dataframe_0length(self) -> None: assert len(actual) == 0 assert_array_equal(actual.index.names, list("ABC")) + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter + arr_np = np.arange(12).reshape(3, 4) + arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo") + + # Default behavior: create MultiIndex + df_with_index = arr.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = arr.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values) + + # Test with coords that have different dimensions + arr.coords["z"] = ("x", [-1, -2, -3]) + df_with_coords = arr.to_dataframe(create_index=False) + assert isinstance(df_with_coords.index, pd.RangeIndex) + assert "z" in df_with_coords.columns + assert len(df_with_coords) == 12 + @pytest.mark.parametrize( "x_dtype,y_dtype,v_dtype", [ diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index e677430dfbf..e30fa28bbbb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2259,6 +2259,38 @@ def test_to_pandas(self) -> None: with pytest.raises(ValueError, match=r"cannot convert Datasets"): Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas() + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter for Dataset + x = np.random.randn(3, 4) + y = np.random.randn(3, 4) + ds = Dataset( + {"a": (("x", "y"), x), "b": (("x", "y"), y)}, + coords={"x": [1, 2, 3], "y": list("abcd")}, + ) + + # Default behavior: create MultiIndex + df_with_index = ds.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = ds.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + assert_array_equal(df_with_index["a"].values, df_without_index["a"].values) + assert_array_equal(df_with_index["b"].values, df_without_index["b"].values) + + # Test with dim_order and create_index=False + df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False) + assert isinstance(df_reordered.index, pd.RangeIndex) + assert len(df_reordered) == 12 + # Check that dim_order affects the data ordering + df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"]) + assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values) + def test_reindex_like(self) -> None: data = create_test_data() data["letters"] = ("dim3", 10 * ["a"]) From 139a19b73dc8720fb75128c577a1989167b99071 Mon Sep 17 00:00:00 2001 From: DHRUVA KUMAR KAUSHAL Date: Thu, 4 Dec 2025 03:29:44 +0530 Subject: [PATCH 2/2] ruff --- xarray/core/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5ba895d4bb0..3c02ef293f5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7236,7 +7236,9 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = Tr else: # Use a simple RangeIndex when create_index=False # Calculate the total size from ordered_dims - total_size = np.prod(list(ordered_dims.values())) if ordered_dims else 0 + total_size = ( + int(np.prod(list(ordered_dims.values()))) if ordered_dims else 0 + ) index = pd.RangeIndex(total_size) broadcasted_df = pd.DataFrame( {