|
22 | 22 | "import pandas as pd\n", |
23 | 23 | "from khiops import core as kh\n", |
24 | 24 | "from khiops.sklearn import KhiopsClassifier\n", |
| 25 | + "from khiops.utils.helpers import train_test_split_dataset\n", |
| 26 | + "from sklearn import metrics\n", |
25 | 27 | "\n", |
26 | 28 | "# If there are any issues you may check Khiops status with the following command\n", |
27 | 29 | "# kh.get_runner().print_status()" |
|
106 | 108 | "metadata": {}, |
107 | 109 | "outputs": [], |
108 | 110 | "source": [ |
109 | | - "headlines_train_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", |
110 | | - "y_sarcasm_train = headlines_df[\"IsSarcasm\"]" |
| 111 | + "headlines_main_df = headlines_df.drop(\"IsSarcasm\", axis=1)\n", |
| 112 | + "y_sarcasm = headlines_df[\"IsSarcasm\"]" |
111 | 113 | ] |
112 | 114 | }, |
113 | 115 | { |
|
138 | 140 | "metadata": {}, |
139 | 141 | "outputs": [], |
140 | 142 | "source": [ |
141 | | - "X_sarcasm_train = {\n", |
| 143 | + "X_sarcasm = {\n", |
142 | 144 | " \"main_table\": \"headlines\",\n", |
143 | 145 | " \"tables\": {\n", |
144 | | - " \"headlines\": (headlines_train_df, \"HeadlineId\"),\n", |
| 146 | + " \"headlines\": (headlines_main_df, \"HeadlineId\"),\n", |
145 | 147 | " \"headline_words\": (headlines_words_df, \"HeadlineId\"),\n", |
146 | 148 | " },\n", |
147 | 149 | "}" |
148 | 150 | ] |
149 | 151 | }, |
| 152 | + { |
| 153 | + "cell_type": "markdown", |
| 154 | + "metadata": {}, |
| 155 | + "source": [ |
| 156 | + "To separate this dataset into train and test, we use the `khiops-python` helper function `train_test_split_dataset`. This function allows splitting ``dict`` dataset specifications:" |
| 157 | + ] |
| 158 | + }, |
| 159 | + { |
| 160 | + "cell_type": "code", |
| 161 | + "execution_count": null, |
| 162 | + "metadata": {}, |
| 163 | + "outputs": [], |
| 164 | + "source": [ |
| 165 | + "(\n", |
| 166 | + " X_sarcasm_train,\n", |
| 167 | + " X_sarcasm_test,\n", |
| 168 | + " y_sarcasm_train,\n", |
| 169 | + " y_sarcasm_test,\n", |
| 170 | + ") = train_test_split_dataset(X_sarcasm, y_sarcasm)" |
| 171 | + ] |
| 172 | + }, |
150 | 173 | { |
151 | 174 | "cell_type": "markdown", |
152 | 175 | "metadata": {}, |
|
196 | 219 | "cell_type": "markdown", |
197 | 220 | "metadata": {}, |
198 | 221 | "source": [ |
199 | | - "Now, we use our sarcasm classifier to obtain predictions on the training data. We normally do that on new test data, and again a multi-table dataset specification would have been needed." |
| 222 | + "Now, we use our sarcasm classifier to obtain predictions and probabilities on the test data:" |
200 | 223 | ] |
201 | 224 | }, |
202 | 225 | { |
|
205 | 228 | "metadata": {}, |
206 | 229 | "outputs": [], |
207 | 230 | "source": [ |
208 | | - "sarcasm_predictions = khc_sarcasm.predict(X_sarcasm_train)\n", |
209 | | - "print(\"HeadlineSarcasm train predictions (first 10 values):\")\n", |
210 | | - "display(sarcasm_predictions[:10])" |
| 231 | + "y_sarcasm_test_predicted = khc_sarcasm.predict(X_sarcasm_test)\n", |
| 232 | + "probas_sarcasm_test = khc_sarcasm.predict_proba(X_sarcasm_test)\n", |
| 233 | + "\n", |
| 234 | + "print(\"HeadlineSarcasm test predictions (first 10 values):\")\n", |
| 235 | + "display(y_sarcasm_test_predicted[:10])\n", |
| 236 | + "print(\"HeadlineSarcasm test prediction probabilities (first 10 values):\")\n", |
| 237 | + "display(probas_sarcasm_test[:10])" |
| 238 | + ] |
| 239 | + }, |
| 240 | + { |
| 241 | + "cell_type": "markdown", |
| 242 | + "metadata": {}, |
| 243 | + "source": [ |
| 244 | + "Finally we may estimate the accuracy and AUC for the test data:" |
| 245 | + ] |
| 246 | + }, |
| 247 | + { |
| 248 | + "cell_type": "code", |
| 249 | + "execution_count": null, |
| 250 | + "metadata": {}, |
| 251 | + "outputs": [], |
| 252 | + "source": [ |
| 253 | + "sarcasm_test_accuracy = metrics.accuracy_score(y_sarcasm_test, y_sarcasm_test_predicted)\n", |
| 254 | + "sarcasm_test_auc = metrics.roc_auc_score(y_sarcasm_test, probas_sarcasm_test[:, 1])\n", |
| 255 | + "\n", |
| 256 | + "print(f\"Sarcasm test accuracy: {sarcasm_test_accuracy}\")\n", |
| 257 | + "print(f\"Sarcasm test auc : {sarcasm_test_auc}\")" |
211 | 258 | ] |
212 | 259 | }, |
213 | 260 | { |
|
249 | 296 | "accidents_dataset_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", |
250 | 297 | "\n", |
251 | 298 | "accidents_file = os.path.join(accidents_dataset_dir, \"Accidents.txt\")\n", |
252 | | - "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", |
| 299 | + "accidents_df = pd.read_csv(accidents_file, sep=\"\\t\", encoding=\"latin1\")\n", |
253 | 300 | "print(f\"Accidents dataframe (first 10 rows):\")\n", |
254 | 301 | "display(accidents_df.head(10))\n", |
255 | 302 | "print()\n", |
256 | 303 | "\n", |
257 | 304 | "vehicles_file = os.path.join(accidents_dataset_dir, \"Vehicles.txt\")\n", |
258 | | - "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"ISO-8859-1\")\n", |
| 305 | + "vehicles_df = pd.read_csv(vehicles_file, sep=\"\\t\", encoding=\"latin1\")\n", |
259 | 306 | "print(f\"Vehicles dataframe (first 10 rows):\")\n", |
260 | 307 | "display(vehicles_df.head(10))" |
261 | 308 | ] |
|
278 | 325 | "outputs": [], |
279 | 326 | "source": [ |
280 | 327 | "accidents_main_df = accidents_df.drop(\"Gravity\", axis=1)\n", |
281 | | - "y_accidents_train = accidents_df[\"Gravity\"]" |
| 328 | + "y_accidents = accidents_df[\"Gravity\"]" |
282 | 329 | ] |
283 | 330 | }, |
284 | 331 | { |
|
298 | 345 | }, |
299 | 346 | "outputs": [], |
300 | 347 | "source": [ |
301 | | - "X_accidents_train = {\n", |
| 348 | + "X_accidents = {\n", |
302 | 349 | " \"main_table\": \"accidents\",\n", |
303 | 350 | " \"tables\": {\n", |
304 | 351 | " \"accidents\": (accidents_main_df, \"AccidentId\"),\n", |
|
307 | 354 | "}" |
308 | 355 | ] |
309 | 356 | }, |
| 357 | + { |
| 358 | + "cell_type": "markdown", |
| 359 | + "metadata": {}, |
| 360 | + "source": [ |
| 361 | + "#### Split the dataset into train and test" |
| 362 | + ] |
| 363 | + }, |
| 364 | + { |
| 365 | + "cell_type": "code", |
| 366 | + "execution_count": null, |
| 367 | + "metadata": { |
| 368 | + "is_khiops_tutorial_solution": true |
| 369 | + }, |
| 370 | + "outputs": [], |
| 371 | + "source": [ |
| 372 | + "(\n", |
| 373 | + " X_accidents_train,\n", |
| 374 | + " X_accidents_test,\n", |
| 375 | + " y_accidents_train,\n", |
| 376 | + " y_accidents_test,\n", |
| 377 | + ") = train_test_split_dataset(X_accidents, y_accidents)" |
| 378 | + ] |
| 379 | + }, |
310 | 380 | { |
311 | 381 | "cell_type": "markdown", |
312 | 382 | "metadata": {}, |
|
333 | 403 | "cell_type": "markdown", |
334 | 404 | "metadata": {}, |
335 | 405 | "source": [ |
336 | | - "#### Print the accuracy and auc of the model\n" |
| 406 | + "#### Print the train accuracy and auc of the model\n" |
337 | 407 | ] |
338 | 408 | }, |
339 | 409 | { |
340 | 410 | "cell_type": "code", |
341 | 411 | "execution_count": null, |
342 | | - "metadata": {}, |
| 412 | + "metadata": { |
| 413 | + "is_khiops_tutorial_solution": true |
| 414 | + }, |
343 | 415 | "outputs": [], |
344 | 416 | "source": [ |
345 | 417 | "accidents_train_performance = (\n", |
|
353 | 425 | "cell_type": "markdown", |
354 | 426 | "metadata": {}, |
355 | 427 | "source": [ |
356 | | - "#### Deploy the classifier to obtain predictions on the training data\n", |
| 428 | + "#### Deploy the classifier to obtain predictions and its probabilities on the test data" |
| 429 | + ] |
| 430 | + }, |
| 431 | + { |
| 432 | + "cell_type": "code", |
| 433 | + "execution_count": null, |
| 434 | + "metadata": { |
| 435 | + "is_khiops_tutorial_solution": true |
| 436 | + }, |
| 437 | + "outputs": [], |
| 438 | + "source": [ |
| 439 | + "y_accidents_test_predicted = khc_accidents.predict(X_accidents_test)\n", |
| 440 | + "probas_accidents_test = khc_accidents.predict_proba(X_accidents_test)\n", |
357 | 441 | "\n", |
358 | | - "*Note that usually one deploys the model on new test data. We deploy on the train dataset to keep the tutorial simple*.\n" |
| 442 | + "print(\"Accidents test predictions (first 10 values):\")\n", |
| 443 | + "display(y_accidents_test_predicted[:10])\n", |
| 444 | + "print(\"Accidents test prediction probabilities (first 10 values):\")\n", |
| 445 | + "display(probas_accidents_test[:10])" |
| 446 | + ] |
| 447 | + }, |
| 448 | + { |
| 449 | + "cell_type": "markdown", |
| 450 | + "metadata": {}, |
| 451 | + "source": [ |
| 452 | + "#### Obtain the accuracy and AUC on the test dataset\n", |
| 453 | + "\n" |
359 | 454 | ] |
360 | 455 | }, |
361 | 456 | { |
|
366 | 461 | }, |
367 | 462 | "outputs": [], |
368 | 463 | "source": [ |
369 | | - "khc_accidents.predict(X_accidents_train)" |
| 464 | + "accidents_test_accuracy = metrics.accuracy_score(\n", |
| 465 | + " y_accidents_test, y_accidents_test_predicted\n", |
| 466 | + ")\n", |
| 467 | + "accidents_test_auc = metrics.roc_auc_score(\n", |
| 468 | + " y_accidents_test, probas_accidents_test[:, 1]\n", |
| 469 | + ")\n", |
| 470 | + "\n", |
| 471 | + "print(f\"Accidents test accuracy: {accidents_test_accuracy}\")\n", |
| 472 | + "print(f\"Accidents test auc : {accidents_test_auc}\")" |
370 | 473 | ] |
371 | 474 | } |
372 | 475 | ], |
|
0 commit comments