From 44bc5ee1ae597dcff4b76dd724d9962acc5fa32b Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 12 Nov 2025 22:15:37 +0000
Subject: [PATCH] feat: Add DataFrame/Series.squeeze

---
 bigframes/dataframe.py                        |  22 ++++
 bigframes/series.py                           |   6 ++
 tests/unit/test_dataframe_polars.py           |  51 +++++++++
 tests/unit/test_series_polars.py              |  19 ++++
 .../bigframes_vendored/pandas/core/generic.py | 102 ++++++++++++++++++
 5 files changed, 200 insertions(+)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index da6da7a925..ac07d83e7b 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2033,6 +2033,28 @@ def nsmallest(
         column_ids = self._sql_names(columns)
         return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep))
 
+    def squeeze(self, axis: typing.Optional[typing.Union[int, str]] = None):
+        # Validate `axis` first so a bad value fails before any data is inspected.
+        # axis=None squeezes every length-1 axis; otherwise only the named one.
+        axis_n = None if axis is None else utils.get_axis_number(axis)
+        squeeze_rows = axis_n in (None, 0)
+        squeeze_cols = axis_n in (None, 1)
+
+        single_col = len(self.columns) == 1
+        # Look up the row count only when rows may be squeezed: it is never
+        # consulted for axis=1, and computing it may require running a query.
+        single_row = squeeze_rows and self.shape[0] == 1
+
+        if single_row and single_col and squeeze_cols:
+            # 1x1 frame squeezed on both axes: fetch the lone value as a scalar.
+            return self.to_pandas().iloc[0, 0]
+        if single_col and squeeze_cols:
+            return bigframes.series.Series(self._block)
+        if single_row:
+            # Transpose so the single row's values become the Series' values.
+            return bigframes.series.Series(self._block.transpose(single_row_mode=True))
+        return self
+
     def insert(
         self,
         loc: int,
diff --git a/bigframes/series.py b/bigframes/series.py
index c11cc48394..2ce2443071 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1133,6 +1133,12 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
             block_ops.nsmallest(self._block, n, [self._value_column], keep=keep)
         )
 
+    def squeeze(self, axis=None):
+        # A Series has only one axis, so `axis` does not affect the result.
+        if len(self) == 1:
+            return self.to_pandas().iloc[0]
+        return self
+
     def isin(self, values) -> "Series":
         if isinstance(values, Series):
             return Series(self._block.isin(values._block))
diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py
index 
b83380d789..0ec58d11df 100644
--- a/tests/unit/test_dataframe_polars.py
+++ b/tests/unit/test_dataframe_polars.py
@@ -1785,6 +1785,57 @@ def test_dataframe_sort_index_inplace(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+@pytest.mark.parametrize(
+    "axis",
+    [0, "columns", None],
+)
+def test_dataframe_squeeze_noop(scalars_dfs, axis):
+    # No-op case: the result is still a DataFrame and must match pandas.
+    scalars_df, scalars_pandas_df = scalars_dfs
+    pd_result = scalars_pandas_df.squeeze(axis=axis)
+    bf_result = scalars_df.squeeze(axis=axis).to_pandas()
+
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    "axis",
+    [1, None],
+)
+def test_dataframe_squeeze_cols(scalars_dfs, axis):
+    # A single-column frame squeezes down to a Series.
+    scalars_df, scalars_pandas_df = scalars_dfs
+    pd_result = scalars_pandas_df[["int64_col"]].squeeze(axis=axis)
+    bf_result = scalars_df[["int64_col"]].squeeze(axis=axis).to_pandas()
+
+    pandas.testing.assert_series_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    "axis",
+    [0, None],
+)
+def test_dataframe_squeeze_rows(scalars_dfs, axis):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # Squeezing a row implicitly transposes, so col types must be compatible.
+    int_cols = ["int64_col", "int64_too"]
+    pd_result = scalars_pandas_df[int_cols].head(1).squeeze(axis=axis)
+    bf_result = scalars_df[int_cols].head(1).squeeze(axis=axis).to_pandas()
+
+    pandas.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_dataframe_squeeze_both_axes(scalars_dfs):
+    # One column and one row squeezed on every axis gives a plain scalar on
+    # both sides, so the two results are compared directly.
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = scalars_pandas_df[["int64_col"]].head(1).squeeze()
+    bf_result = scalars_df[["int64_col"]].head(1).squeeze()
+
+    assert pd_result == bf_result
+
+
 def test_df_abs(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     columns = ["int64_col", "int64_too", "float64_col"]
diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py
index 6f729b0df0..2cd2fcbbd0 100644
--- a/tests/unit/test_series_polars.py
+++ 
b/tests/unit/test_series_polars.py
@@ -544,6 +544,25 @@ def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_inde
     assert pd_result == bf_result
 
 
+def test_series_squeeze_noop(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    pd_result = scalars_pandas_df["int64_too"].squeeze()
+    bf_result = scalars_df["int64_too"].squeeze().to_pandas()
+
+    assert_series_equal(bf_result, pd_result)
+
+
+def test_series_squeeze_squeezes(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    # a one-element Series squeezes to a scalar, so compare the values directly
+    pd_result = scalars_pandas_df["int64_too"].head(1).squeeze()
+    bf_result = scalars_df["int64_too"].head(1).squeeze()
+
+    assert pd_result == bf_result
+
+
 def test_series_get_with_default_index(scalars_dfs):
     col_name = "float64_col"
     key = 2
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 63b9f8199b..66e0c967da 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -82,6 +82,108 @@ def __iter__(self) -> Iterator:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def squeeze(self, axis=None):
+        """
+        Squeeze 1 dimensional axis objects into scalars.
+
+        Series or DataFrames with a single element are squeezed to a scalar.
+        DataFrames with a single column or a single row are squeezed to a
+        Series. Otherwise the object is unchanged.
+
+        This method is most useful when you don't know if your
+        object is a Series or DataFrame, but you do know it has just a single
+        column. In that case you can safely call `squeeze` to ensure you have a
+        Series.
+
+        **Examples:**
+            >>> primes = bpd.Series([2, 3, 5, 7])
+
+        Slicing might produce a Series with a single value:
+
+            >>> even_primes = primes[primes % 2 == 0]
+            >>> even_primes
+            0    2
+            dtype: Int64
+
+            >>> even_primes.squeeze()
+            np.int64(2)
+
+        Squeezing objects with more than one value in every axis does nothing:
+
+            >>> odd_primes = primes[primes % 2 == 1]
+            >>> odd_primes
+            1    3
+            2    5
+            3    7
+            dtype: Int64
+
+            >>> odd_primes.squeeze()
+            1    3
+            2    5
+            3    7
+            dtype: Int64
+
+        Squeezing is even more effective when used with DataFrames.
+
+            >>> df = bpd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
+            >>> df
+               a  b
+            0  1  2
+            1  3  4
+
+            [2 rows x 2 columns]
+
+        Slicing a single column will produce a DataFrame with the columns
+        having only one value:
+
+            >>> df_a = df[['a']]
+            >>> df_a
+               a
+            0  1
+            1  3
+
+            [2 rows x 1 columns]
+
+        So the columns can be squeezed down, resulting in a Series:
+
+            >>> df_a.squeeze('columns')
+            0    1
+            1    3
+            Name: a, dtype: Int64
+
+        Slicing a single row from a single column will produce a single
+        scalar DataFrame:
+
+            >>> df_0a = df.loc[[0], ['a']]
+            >>> df_0a
+               a
+            0  1
+
+            [1 rows x 1 columns]
+
+        Squeezing the rows produces a single scalar Series:
+
+            >>> df_0a.squeeze('rows')
+            a    1
+            Name: 0, dtype: Int64
+
+        Squeezing all axes will project directly into a scalar:
+
+            >>> df_0a.squeeze()
+            np.int64(1)
+
+        Args:
+            axis ({0 or 'index', 1 or 'columns', None}, default None):
+                A specific axis to squeeze. By default, all length-1 axes are
+                squeezed. For `Series` this parameter is unused and defaults to `None`.
+
+        Returns:
+            DataFrame, Series, or scalar:
+                The projection after squeezing `axis` or all the axes.
+
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     # -------------------------------------------------------------------------
     # Unary Methods