From 2b5f122fc175ade6d7402956c257d72233ed8f2c Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 19 Nov 2025 23:14:39 +0000 Subject: [PATCH 1/3] fix: ml.model_selection.train_test_split index to match in unordered mode --- bigframes/ml/model_selection.py | 1 + tests/system/small/ml/test_model_selection.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 6eba4f81c2..5f22de2f27 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -110,6 +110,7 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra joined_df = dfs[0] for df in dfs[1:]: joined_df = joined_df.join(df, how="outer") + joined_df = joined_df.cache() if stratify is None: joined_df_train, joined_df_test = joined_df._split( fracs=(train_size, test_size), random_state=random_state diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index ebce6e405a..992a884f64 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -46,6 +46,24 @@ def test_train_test_split_default_correct_shape(df_fixture, request): assert y_test.shape == (86, 1) +def test_train_test_split_default_unordered_same_index( + unordered_session, penguins_pandas_df_default_index +): + df = unordered_session.read_pandas(penguins_pandas_df_default_index) + X = df[ + [ + "species", + "island", + "culmen_length_mm", + ] + ] + y = df[["body_mass_g"]] + X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y) + + pd.testing.assert_index_equal(X_train.to_pandas().index, y_train.to_pandas().index) + pd.testing.assert_index_equal(X_test.to_pandas().index, y_test.to_pandas().index) + + def test_train_test_split_series_default_correct_shape(penguins_df_default_index): X = penguins_df_default_index[["species"]] y = penguins_df_default_index["body_mass_g"] From 
9e2cd71b1907b236093c950a3bddfdde9ec22529 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Thu, 20 Nov 2025 00:17:02 +0000 Subject: [PATCH 2/3] fix: cache each returned train/test selection in train_test_split --- bigframes/ml/model_selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 5f22de2f27..a06fbeb818 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -121,8 +121,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra results = [] for array in arrays: columns = array.name if isinstance(array, bpd.Series) else array.columns - results.append(joined_df_train[columns]) - results.append(joined_df_test[columns]) + results.append(joined_df_train[columns].cache()) + results.append(joined_df_test[columns].cache()) return results From b29c6f3f1cf09294610966eba566df57a6f69988 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Mon, 24 Nov 2025 23:53:53 +0000 Subject: [PATCH 3/3] fix: move cache() onto the split train/test frames instead of the joined frame and each returned selection --- bigframes/ml/model_selection.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index a06fbeb818..cc3086dca4 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -110,7 +110,6 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra joined_df = dfs[0] for df in dfs[1:]: joined_df = joined_df.join(df, how="outer") - joined_df = joined_df.cache() if stratify is None: joined_df_train, joined_df_test = joined_df._split( fracs=(train_size, test_size), random_state=random_state @@ -118,11 +117,14 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra else: joined_df_train, joined_df_test = _stratify_split(joined_df, stratify) + joined_df_train = joined_df_train.cache() + joined_df_test = joined_df_test.cache() + results = [] for array in arrays: columns = array.name if isinstance(array, bpd.Series) else 
array.columns - results.append(joined_df_train[columns].cache()) - results.append(joined_df_test[columns].cache()) + results.append(joined_df_train[columns]) + results.append(joined_df_test[columns]) return results