From 2b5f122fc175ade6d7402956c257d72233ed8f2c Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Wed, 19 Nov 2025 23:14:39 +0000
Subject: [PATCH 1/3] fix: make ml.model_selection.train_test_split indexes
 match in unordered mode

---
 bigframes/ml/model_selection.py               |  1 +
 tests/system/small/ml/test_model_selection.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
index 6eba4f81c2..5f22de2f27 100644
--- a/bigframes/ml/model_selection.py
+++ b/bigframes/ml/model_selection.py
@@ -110,6 +110,7 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     joined_df = dfs[0]
     for df in dfs[1:]:
         joined_df = joined_df.join(df, how="outer")
+    joined_df = joined_df.cache()
     if stratify is None:
         joined_df_train, joined_df_test = joined_df._split(
             fracs=(train_size, test_size), random_state=random_state
diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py
index ebce6e405a..992a884f64 100644
--- a/tests/system/small/ml/test_model_selection.py
+++ b/tests/system/small/ml/test_model_selection.py
@@ -46,6 +46,24 @@ def test_train_test_split_default_correct_shape(df_fixture, request):
     assert y_test.shape == (86, 1)
 
 
+def test_train_test_split_default_unordered_same_index(
+    unordered_session, penguins_pandas_df_default_index
+):
+    df = unordered_session.read_pandas(penguins_pandas_df_default_index)
+    X = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+        ]
+    ]
+    y = df[["body_mass_g"]]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)
+
+    pd.testing.assert_index_equal(X_train.to_pandas().index, y_train.to_pandas().index)
+    pd.testing.assert_index_equal(X_test.to_pandas().index, y_test.to_pandas().index)
+
+
 def test_train_test_split_series_default_correct_shape(penguins_df_default_index):
     X = penguins_df_default_index[["species"]]
     y = penguins_df_default_index["body_mass_g"]

From 9e2cd71b1907b236093c950a3bddfdde9ec22529 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Thu, 20 Nov 2025 00:17:02 +0000
Subject: [PATCH 2/3] fix: cache each returned train/test split

---
 bigframes/ml/model_selection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
index 5f22de2f27..a06fbeb818 100644
--- a/bigframes/ml/model_selection.py
+++ b/bigframes/ml/model_selection.py
@@ -121,8 +121,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     results = []
     for array in arrays:
         columns = array.name if isinstance(array, bpd.Series) else array.columns
-        results.append(joined_df_train[columns])
-        results.append(joined_df_test[columns])
+        results.append(joined_df_train[columns].cache())
+        results.append(joined_df_test[columns].cache())
 
     return results
 

From b29c6f3f1cf09294610966eba566df57a6f69988 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Mon, 24 Nov 2025 23:53:53 +0000
Subject: [PATCH 3/3] move cache() to the joined train/test split frames

---
 bigframes/ml/model_selection.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
index a06fbeb818..cc3086dca4 100644
--- a/bigframes/ml/model_selection.py
+++ b/bigframes/ml/model_selection.py
@@ -110,7 +110,6 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     joined_df = dfs[0]
     for df in dfs[1:]:
         joined_df = joined_df.join(df, how="outer")
-    joined_df = joined_df.cache()
     if stratify is None:
         joined_df_train, joined_df_test = joined_df._split(
             fracs=(train_size, test_size), random_state=random_state
@@ -118,11 +117,14 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     else:
         joined_df_train, joined_df_test = _stratify_split(joined_df, stratify)
 
+    joined_df_train = joined_df_train.cache()
+    joined_df_test = joined_df_test.cache()
+
     results = []
     for array in arrays:
         columns = array.name if isinstance(array, bpd.Series) else array.columns
-        results.append(joined_df_train[columns].cache())
-        results.append(joined_df_test[columns].cache())
+        results.append(joined_df_train[columns])
+        results.append(joined_df_test[columns])
 
     return results