|
22 | 22 | "import pandas as pd\n", |
23 | 23 | "from khiops import core as kh\n", |
24 | 24 | "from khiops.sklearn import KhiopsClassifier\n", |
| 25 | + "from khiops.utils.helpers import train_test_split_dataset\n", |
| 26 | + "from sklearn import metrics\n", |
25 | 27 | "\n", |
26 | 28 | "# If there are any issues you may check Khiops status with the following command\n", |
27 | 29 | "# kh.get_runner().print_status()" |
|
106 | 108 | "metadata": {}, |
107 | 109 | "outputs": [], |
108 | 110 | "source": [ |
109 | | - "headlines_train_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", |
110 | | - "y_sarcasm_train = headlines_df[\"IsSarcasm\"]" |
| 111 | + "headlines_main_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", |
| 112 | + "y_sarcasm = headlines_df[\"IsSarcasm\"]" |
111 | 113 | ] |
112 | 114 | }, |
113 | 115 | { |
|
138 | 140 | "metadata": {}, |
139 | 141 | "outputs": [], |
140 | 142 | "source": [ |
141 | | - "X_sarcasm_train = {\n", |
| 143 | + "X_sarcasm = {\n", |
142 | 144 | " \"main_table\": \"headlines\",\n", |
143 | 145 | " \"tables\": {\n", |
144 | | - " \"headlines\": (headlines_train_df, \"HeadlineId\"),\n", |
| 146 | + " \"headlines\": (headlines_main_df, \"HeadlineId\"),\n", |
145 | 147 | " \"headline_words\": (headlines_words_df, \"HeadlineId\"),\n", |
146 | 148 | " },\n", |
147 | 149 | "}" |
148 | 150 | ] |
149 | 151 | }, |
| 152 | + { |
| 153 | + "cell_type": "markdown", |
| 154 | + "metadata": {}, |
| 155 | + "source": [ |
| 156 | + "To separate this dataset into train and test, we use the `khiops-python` helper function `train_test_split_dataset`. This function allows splitting ``dict`` dataset specifications:" |
| 157 | + ] |
| 158 | + }, |
| 159 | + { |
| 160 | + "cell_type": "code", |
| 161 | + "execution_count": null, |
| 162 | + "metadata": {}, |
| 163 | + "outputs": [], |
| 164 | + "source": [ |
| 165 | + "(\n", |
| 166 | + " X_sarcasm_train,\n", |
| 167 | + " X_sarcasm_test,\n", |
| 168 | + " y_sarcasm_train,\n", |
| 169 | + " y_sarcasm_test,\n", |
| 170 | + ") = train_test_split_dataset(X_sarcasm, y_sarcasm)" |
| 171 | + ] |
| 172 | + }, |
150 | 173 | { |
151 | 174 | "cell_type": "markdown", |
152 | 175 | "metadata": {}, |
|
196 | 219 | "cell_type": "markdown", |
197 | 220 | "metadata": {}, |
198 | 221 | "source": [ |
199 | | - "Now, we use our sarcasm classifier to obtain predictions on the training data. We normally do that on new test data, and again a multi-table dataset specification would have been needed." |
| 222 | + "Now, we use our sarcasm classifier to obtain predictions and probabilities on the test data:" |
200 | 223 | ] |
201 | 224 | }, |
202 | 225 | { |
|
205 | 228 | "metadata": {}, |
206 | 229 | "outputs": [], |
207 | 230 | "source": [ |
208 | | - "sarcasm_predictions = khc_sarcasm.predict(X_sarcasm_train)\n", |
209 | | - "print(\"HeadlineSarcasm train predictions (first 10 values):\")\n", |
210 | | - "display(sarcasm_predictions[:10])" |
| 231 | + "y_sarcasm_test_predicted = khc_sarcasm.predict(X_sarcasm_test)\n", |
| 232 | + "probas_sarcasm_test = khc_sarcasm.predict_proba(X_sarcasm_test)\n", |
| 233 | + "\n", |
| 234 | + "print(\"HeadlineSarcasm test predictions (first 10 values):\")\n", |
| 235 | + "display(y_sarcasm_test_predicted[:10])\n", |
| 236 | + "print(\"HeadlineSarcasm test prediction probabilities (first 10 values):\")\n", |
| 237 | + "display(probas_sarcasm_test[:10])" |
| 238 | + ] |
| 239 | + }, |
| 240 | + { |
| 241 | + "cell_type": "markdown", |
| 242 | + "metadata": {}, |
| 243 | + "source": [ |
| 244 | + "Finally we may estimate the accuracy and AUC for the test data:" |
| 245 | + ] |
| 246 | + }, |
| 247 | + { |
| 248 | + "cell_type": "code", |
| 249 | + "execution_count": null, |
| 250 | + "metadata": {}, |
| 251 | + "outputs": [], |
| 252 | + "source": [ |
| 253 | + "sarcasm_test_accuracy = metrics.accuracy_score(y_sarcasm_test, y_sarcasm_test_predicted)\n", |
| 254 | + "sarcasm_test_auc = metrics.roc_auc_score(y_sarcasm_test, probas_sarcasm_test[:, 1])\n", |
| 255 | + "\n", |
| 256 | + "print(f\"Sarcasm test accuracy: {sarcasm_test_accuracy}\")\n", |
| 257 | + "print(f\"Sarcasm test auc : {sarcasm_test_auc}\")" |
211 | 258 | ] |
212 | 259 | }, |
213 | 260 | { |
|
249 | 296 | "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", |
250 | 297 | "\n", |
251 | 298 | "accidents_file = os.path.join(accidents_dataset_dir, \"Accidents.txt\")\n", |
252 | | - "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", |
| 299 | + "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"latin1\")\n", |
253 | 300 | "print(f\"Accidents dataframe (first 10 rows):\")\n", |
254 | 301 | "display(accidents_df.head(10))\n", |
255 | 302 | "print()\n", |
256 | 303 | "\n", |
257 | 304 | "vehicles_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", |
258 | | - "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", |
| 305 | + "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"latin1\")\n", |
259 | 306 | "print(f\"Vehicles dataframe (first 10 rows):\")\n", |
260 | 307 | "display(vehicles_df.head(10))" |
261 | 308 | ] |
|
278 | 325 | "outputs": [], |
279 | 326 | "source": [ |
280 | 327 | "accidents_main_df = accidents_df.drop(\"Gravity\", axis=1)\n", |
281 | | - "y_accidents_train = accidents_df[\"Gravity\"]" |
| 328 | + "y_accidents = accidents_df[\"Gravity\"]" |
282 | 329 | ] |
283 | 330 | }, |
284 | 331 | { |
|
298 | 345 | }, |
299 | 346 | "outputs": [], |
300 | 347 | "source": [ |
301 | | - "X_accidents_train = {\n", |
| 348 | + "X_accidents = {\n", |
302 | 349 | " \"main_table\": \"accidents\",\n", |
303 | 350 | " \"tables\": {\n", |
304 | 351 | " \"accidents\": (accidents_main_df, \"AccidentId\"),\n", |
|
307 | 354 | "}" |
308 | 355 | ] |
309 | 356 | }, |
| 357 | + { |
| 358 | + "cell_type": "markdown", |
| 359 | + "metadata": {}, |
| 360 | + "source": [ |
| 361 | + "#### Split the dataset into train and test" |
| 362 | + ] |
| 363 | + }, |
| 364 | + { |
| 365 | + "cell_type": "code", |
| 366 | + "execution_count": null, |
| 367 | + "metadata": { |
| 368 | + "is_khiops_tutorial_solution": true |
| 369 | + }, |
| 370 | + "outputs": [], |
| 371 | + "source": [ |
| 372 | + "(\n", |
| 373 | + " X_accidents_train,\n", |
| 374 | + " X_accidents_test,\n", |
| 375 | + " y_accidents_train,\n", |
| 376 | + " y_accidents_test,\n", |
| 377 | + ") = train_test_split_dataset(X_accidents, y_accidents)" |
| 378 | + ] |
| 379 | + }, |
310 | 380 | { |
311 | 381 | "cell_type": "markdown", |
312 | 382 | "metadata": {}, |
|
333 | 403 | "cell_type": "markdown", |
334 | 404 | "metadata": {}, |
335 | 405 | "source": [ |
336 | | - "#### Print the accuracy and auc of the model\n" |
| 406 | + "#### Print the train accuracy and auc of the model\n" |
337 | 407 | ] |
338 | 408 | }, |
339 | 409 | { |
340 | 410 | "cell_type": "code", |
341 | 411 | "execution_count": null, |
342 | | - "metadata": {}, |
| 412 | + "metadata": { |
| 413 | + "is_khiops_tutorial_solution": true |
| 414 | + }, |
343 | 415 | "outputs": [], |
344 | 416 | "source": [ |
345 | 417 | "accidents_train_performance = (\n", |
|
353 | 425 | "cell_type": "markdown", |
354 | 426 | "metadata": {}, |
355 | 427 | "source": [ |
356 | | - "#### Deploy the classifier to obtain predictions on the training data\n", |
| 428 | + "#### Deploy the classifier to obtain predictions and its probabilities on the test data" |
| 429 | + ] |
| 430 | + }, |
| 431 | + { |
| 432 | + "cell_type": "code", |
| 433 | + "execution_count": null, |
| 434 | + "metadata": { |
| 435 | + "is_khiops_tutorial_solution": true |
| 436 | + }, |
| 437 | + "outputs": [], |
| 438 | + "source": [ |
| 439 | + "y_accidents_test_predicted = khc_accidents.predict(X_accidents_test)\n", |
| 440 | + "probas_accidents_test = khc_accidents.predict_proba(X_accidents_test)\n", |
357 | 441 | "\n", |
358 | | - "*Note that usually one deploys the model on new test data. We deploy on the train dataset to keep the tutorial simple*.\n" |
| 442 | + "print(\"Accidents test predictions (first 10 values):\")\n", |
| 443 | + "display(y_accidents_test_predicted[:10])\n", |
| 444 | + "print(\"Accidents test prediction probabilities (first 10 values):\")\n", |
| 445 | + "display(probas_accidents_test[:10])" |
| 446 | + ] |
| 447 | + }, |
| 448 | + { |
| 449 | + "cell_type": "markdown", |
| 450 | + "metadata": {}, |
| 451 | + "source": [ |
| 452 | + "#### Obtain the accuracy and AUC on the test dataset\n", |
| 453 | + "\n" |
359 | 454 | ] |
360 | 455 | }, |
361 | 456 | { |
|
366 | 461 | }, |
367 | 462 | "outputs": [], |
368 | 463 | "source": [ |
369 | | - "khc_accidents.predict(X_accidents_train)" |
| 464 | + "accidents_test_accuracy = metrics.accuracy_score(\n", |
| 465 | + " y_accidents_test, y_accidents_test_predicted\n", |
| 466 | + ")\n", |
| 467 | + "accidents_test_auc = metrics.roc_auc_score(\n", |
| 468 | + " y_accidents_test, probas_accidents_test[:, 1]\n", |
| 469 | + ")\n", |
| 470 | + "\n", |
| 471 | + "print(f\"Accidents test accuracy: {accidents_test_accuracy}\")\n", |
| 472 | + "print(f\"Accidents test auc : {accidents_test_auc}\")" |
370 | 473 | ] |
371 | 474 | } |
372 | 475 | ], |
|
0 commit comments