KhiopsML
diff --git a/‎Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb‎
Lines changed: 3 additions & 1 deletion b/‎Core Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb‎
Lines changed: 120 additions & 17 deletions b/‎Sklearn Basics 2 - Train a Classifier on a Star Multi-Table Dataset.ipynb‎
Lines changed: 120 additions & 17 deletions
@@ -344,7 +344,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "is_khiops_tutorial_solution": true
+   },
    "outputs": [],
    "source": [
     "# To visualize uncomment the line below\n",
 
@@ -22,6 +22,8 @@
     "import pandas as pd\n",
     "from khiops import core as kh\n",
     "from khiops.sklearn import KhiopsClassifier\n",
+    "from khiops.utils.helpers import train_test_split_dataset\n",
+    "from sklearn import metrics\n",
     "\n",
     "# If there are any issues, you may print the Khiops status with the following command\n",
     "# kh.get_runner().print_status()"
@@ -106,8 +108,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "headlines_train_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n",
-    "y_sarcasm_train = headlines_df[\"IsSarcasm\"]"
+    "headlines_main_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n",
+    "y_sarcasm = headlines_df[\"IsSarcasm\"]"
    ]
   },
   {
@@ -138,15 +140,36 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X_sarcasm_train = {\n",
+    "X_sarcasm = {\n",
     "    \"main_table\": \"headlines\",\n",
     "    \"tables\": {\n",
-    "        \"headlines\": (headlines_train_df, \"HeadlineId\"),\n",
+    "        \"headlines\": (headlines_main_df, \"HeadlineId\"),\n",
     "        \"headline_words\": (headlines_words_df, \"HeadlineId\"),\n",
     "    },\n",
     "}"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To separate this dataset into train and test, we use the `khiops-python` helper function `train_test_split_dataset`. This function allows splitting ``dict`` dataset specifications:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(\n",
+    "    X_sarcasm_train,\n",
+    "    X_sarcasm_test,\n",
+    "    y_sarcasm_train,\n",
+    "    y_sarcasm_test,\n",
+    ") = train_test_split_dataset(X_sarcasm, y_sarcasm)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -196,7 +219,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now, we use our sarcasm classifier to obtain predictions on the training data. We normally do that on new test data, and again a multi-table dataset specification would have been needed."
+    "Now, we use our sarcasm classifier to obtain predictions and probabilities on the test data:"
    ]
   },
   {
@@ -205,9 +228,33 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sarcasm_predictions = khc_sarcasm.predict(X_sarcasm_train)\n",
-    "print(\"HeadlineSarcasm train predictions (first 10 values):\")\n",
-    "display(sarcasm_predictions[:10])"
+    "y_sarcasm_test_predicted = khc_sarcasm.predict(X_sarcasm_test)  # predictions on the test split\n",
+    "probas_sarcasm_test = khc_sarcasm.predict_proba(X_sarcasm_test)  # probabilities on the test split\n",
+    "\n",
+    "print(\"HeadlineSarcasm test predictions (first 10 values):\")\n",
+    "display(y_sarcasm_test_predicted[:10])\n",
+    "print(\"HeadlineSarcasm test prediction probabilities (first 10 values):\")\n",
+    "display(probas_sarcasm_test[:10])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally we may estimate the accuracy and AUC for the test data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sarcasm_test_accuracy = metrics.accuracy_score(y_sarcasm_test, y_sarcasm_test_predicted)\n",
+    "sarcasm_test_auc = metrics.roc_auc_score(y_sarcasm_test, probas_sarcasm_test[:, 1])\n",
+    "\n",
+    "print(f\"Sarcasm test accuracy: {sarcasm_test_accuracy}\")\n",
+    "print(f\"Sarcasm test auc     : {sarcasm_test_auc}\")"
    ]
   },
   {
@@ -249,13 +296,13 @@
     "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
     "\n",
     "accidents_file = os.path.join(accidents_dataset_dir, \"Accidents.txt\")\n",
-    "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n",
+    "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"latin1\")\n",
     "print(f\"Accidents dataframe (first 10 rows):\")\n",
     "display(accidents_df.head(10))\n",
     "print()\n",
     "\n",
     "vehicles_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n",
-    "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n",
+    "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"latin1\")\n",
     "print(f\"Vehicles dataframe (first 10 rows):\")\n",
     "display(vehicles_df.head(10))"
    ]
@@ -278,7 +325,7 @@
    "outputs": [],
    "source": [
     "accidents_main_df = accidents_df.drop(\"Gravity\", axis=1)\n",
-    "y_accidents_train = accidents_df[\"Gravity\"]"
+    "y_accidents = accidents_df[\"Gravity\"]"
    ]
   },
   {
@@ -298,7 +345,7 @@
    },
    "outputs": [],
    "source": [
-    "X_accidents_train = {\n",
+    "X_accidents = {\n",
     "    \"main_table\": \"accidents\",\n",
     "    \"tables\": {\n",
     "        \"accidents\": (accidents_main_df, \"AccidentId\"),\n",
@@ -307,6 +354,29 @@
     "}"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Split the dataset into train and test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "is_khiops_tutorial_solution": true
+   },
+   "outputs": [],
+   "source": [
+    "(\n",
+    "    X_accidents_train,\n",
+    "    X_accidents_test,\n",
+    "    y_accidents_train,\n",
+    "    y_accidents_test,\n",
+    ") = train_test_split_dataset(X_accidents, y_accidents)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -333,13 +403,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Print the accuracy and auc of the model\n"
+    "#### Print the train accuracy and auc of the model\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "is_khiops_tutorial_solution": true
+   },
    "outputs": [],
    "source": [
     "accidents_train_performance = (\n",
@@ -353,9 +425,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Deploy the classifier to obtain predictions on the training data\n",
+    "#### Deploy the classifier to obtain predictions and their probabilities on the test data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "is_khiops_tutorial_solution": true
+   },
+   "outputs": [],
+   "source": [
+    "y_accidents_test_predicted = khc_accidents.predict(X_accidents_test)  # predictions on the test split\n",
+    "probas_accidents_test = khc_accidents.predict_proba(X_accidents_test)  # probabilities on the test split\n",
     "\n",
+    "print(\"Accidents test predictions (first 10 values):\")\n",
+    "display(y_accidents_test_predicted[:10])\n",
+    "print(\"Accidents test prediction probabilities (first 10 values):\")\n",
+    "display(probas_accidents_test[:10])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Obtain the accuracy and AUC on the test dataset\n",
+    "\n"
    ]
   },
   {
@@ -366,7 +461,15 @@
    },
    "outputs": [],
    "source": [
-    "khc_accidents.predict(X_accidents_train)"
+    "accidents_test_accuracy = metrics.accuracy_score(\n",
+    "    y_accidents_test, y_accidents_test_predicted\n",
+    ")\n",
+    "accidents_test_auc = metrics.roc_auc_score(\n",
+    "    y_accidents_test, probas_accidents_test[:, 1]\n",
+    ")\n",
+    "\n",
+    "print(f\"Accidents test accuracy: {accidents_test_accuracy}\")\n",
+    "print(f\"Accidents test auc     : {accidents_test_auc}\")"
    ]
   }
  ],