diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 00000000..9122aab1 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,1702 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "BigScience Metadata playground", + "provenance": [], + "collapsed_sections": [ + "SbhiOxCY8rZ2", + "veqze27o9HEX", + "VpEg7z7CAiUc", + "L6_jjYRf0ZXN", + "UI_pyX7P0cRG", + "ckWrWh2r07ZH", + "wexEuNAuOTrI", + "cEPExQdIbqN1", + "6lisgI0hbLUd", + "QluA_8eLbTqr", + "__lnPt_ya_SS", + "0nXMJuiMbcBx", + "j3EMRvoAbf9Y" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "rE6KUpKC-xb7" + }, + "source": [ + "# !nvidia-smi -L" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5DtYDK2LL6jM" + }, + "source": [ + "# import os\n", + "# from tensorflow.python.profiler import profiler_client\n", + "\n", + "# tpu_profile_service_address = os.environ['COLAB_TPU_ADDR'].replace('8470', '8466')\n", + "# print(profiler_client.monitor(tpu_profile_service_address, 100, 2))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V5QKUdkUMBAr" + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "reQ0s7ej6tO2" + }, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qX28VK4UYGRX" + }, + "source": [ + "from pathlib import Path\n", + "\n", + "\n", + "DATA_DIR=Path(\"/content/drive/MyDrive/colab_data/bigscience\")\n", + "!mkdir -p {DATA_DIR}\n", + "\n", + "!rm -rf sample_data" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RwcpNOdzTo-o" + }, + "source": [ + "TRN_CNT, TRN_OFFSET = 10000, 
0\n", + "VLD_CNT, VLD_OFFSET = 1000, 1000\n", + "ds_key, ds_subkey = \"mc4\", \"en\"\n", + "ds_name = f\"{ds_key}-{ds_subkey}\"\n", + "trn_jsonl_p = DATA_DIR / f\"{ds_name}_trn_{TRN_OFFSET}-{TRN_OFFSET+TRN_CNT}.jsonl\"\n", + "vld_jsonl_p = DATA_DIR / f\"{ds_name}_vld_{VLD_OFFSET}-{VLD_OFFSET+VLD_CNT}.jsonl\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NLyFy7Tr0gJz" + }, + "source": [ + "## URL Metadata of mC4 `(trn[:10000], vld[1000:2000])`\n", + "Skip this subsection if the jsonl files for the same data subsets have already been prepared." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oFgAWS0dD2mq" + }, + "source": [ + "!pip check\n", + "!pip install -q datasets[streaming]\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "apuZh6PrVQhW" + }, + "source": [ + "from datasets import load_dataset\n", + "\n", + "\n", + "mc4_en_trn_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"train\",\n", + " # data_dir=DATA_DIR,\n", + " streaming=True,\n", + ")\n", + "mc4_en_vld_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"validation\",\n", + " # data_dir=DATA_DIR,\n", + " streaming=True,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZstWjsMWygfG" + }, + "source": [ + "!pip check\n", + "!pip install -q jsonlines\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "b4CozMGrjEOu" + }, + "source": [ + "import jsonlines\n", + "\n", + "\n", + "def _gen(ds, limit=None, offset=0):\n", + " for i, data in enumerate(ds):\n", + " if i < offset: continue\n", + " if i - offset == limit: break\n", + " yield {\n", + " \"text\": data['text'],\n", + " \"metadata\": [{\n", + " \"key\": \"url\",\n", + " \"type\": \"global\",\n", + " \"value\": data[\"url\"],\n", + " }],\n", + " 
}\n", + "\n", + "\n", + "with jsonlines.open(trn_jsonl_p, mode='w') as writer:\n", + " writer.write_all(l for l in _gen(mc4_en_trn_ds, TRN_CNT, TRN_OFFSET))\n", + "with jsonlines.open(vld_jsonl_p, mode='w') as writer:\n", + " writer.write_all(l for l in _gen(mc4_en_vld_ds, VLD_CNT, VLD_OFFSET))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbhiOxCY8rZ2" + }, + "source": [ + "# Trials" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "veqze27o9HEX" + }, + "source": [ + "## Dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qqXXP-0sAzlv" + }, + "source": [ + "!pip check\n", + "!pip install -qU pip\n", + "!pip freeze | grep torch\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1Fv8-Xr2B21z" + }, + "source": [ + "!pip check\n", + "!pip install -q torch==1.8.1 torchvision==0.9.1 torchtext==0.9.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html\n", + "!pip freeze | grep torch\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpEg7z7CAiUc" + }, + "source": [ + "### TPU\n", + "Skip this step if not using TPU." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X_nCrsMuxRYa" + }, + "source": [ + "!pip check\n", + "# From https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb\n", + "# Unclear whether TPU support still requires the not-yet-released version of accelerate.\n", + "# !pip install -q cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n", + "!pip install -q cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8.1-cp37-cp37m-linux_x86_64.whl\n", + "# !pip install -q git+https://github.com/huggingface/accelerate\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fCISNe_6XnZ8" + }, + "source": [ + "### Package" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ptIFglMEAHnG" + }, + "source": [ + "%cd /content\n", + "!rm -rf bigscience-metadata\n", + "!git clone -b perf-collator_with_padding_for_tpu https://github.com/tianjianjiang/bigscience-metadata.git" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RhnFQ-EXr-JP" + }, + "source": [ + "%cd /content/bigscience-metadata\n", + "!pip install -q -e .\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4BXNw3yrUX9A" + }, + "source": [ + "!wandb login" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L6_jjYRf0ZXN" + }, + "source": [ + "## Without URL" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_IdlsR4X0Vfq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3fb8ab2b-8df3-401e-c9c2-a023c2fd71bd" + }, + "source": [ + "# !TOKENIZERS_PARALLELISM=false \\\n", + "!accelerate launch --fp16 \\\n", + "/content/bigscience-metadata/bsmetadata/train.py \\\n", + "max_train_steps=100 
num_eval=1 \\\n", + "data_config.experiment=without_metadata \\\n", + "data_config.per_device_eval_batch_size=8 \\\n", + "data_config.max_seq_len=768 \\\n", + "data_config.train_file={trn_jsonl_p} \\\n", + "data_config.validation_file={vld_jsonl_p}" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "data_config:\n", + " experiment: without_metadata\n", + " per_device_eval_batch_size: 8\n", + " per_device_train_batch_size: 2\n", + " metadata_list: []\n", + " metadata_sep: ' | '\n", + " metadata_key_value_sep: ': '\n", + " metadata_probability: 1.0\n", + " global_metadata_sep: ' |||'\n", + " max_seq_len: 768\n", + " dataset_name: null\n", + " dataset_config_name: null\n", + " train_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl\n", + " validation_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl\n", + " overwrite_cache: false\n", + " cache_dir: null\n", + " preprocessing_num_workers: null\n", + " validation_split_percentage: 5\n", + " block_size: null\n", + " distributed_type: TPU\n", + "weight_decay: 0.0\n", + "learning_rate: 5.0e-05\n", + "gradient_accumulation_steps: 1\n", + "num_train_epochs: 1\n", + "max_train_steps: 100\n", + "lr_scheduler_type: linear\n", + "num_warmup_steps: 1000\n", + "seed: 42\n", + "out_dir: output_dir\n", + "num_eval: 1\n", + "model_name: gpt2\n", + "project_name: metadata_lm\n", + "\n", + "[2021-09-08 22:35:02,024][datasets.builder][WARNING] - Using custom data configuration default-e08b76da1fb65150\n", + "[2021-09-08 22:35:02,028][datasets.builder][WARNING] - Reusing dataset json (/root/.cache/huggingface/datasets/json/default-e08b76da1fb65150/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264)\n", + "Running tokenizer on dataset: 100% 10/10 
[00:20<00:00, 2.02s/ba]\n", + "Running tokenizer on dataset: 100% 1/1 [00:02<00:00, 2.13s/ba]\n", + "Grouping texts in chunks of 1024: 100% 10/10 [00:36<00:00, 3.60s/ba]\n", + "Grouping texts in chunks of 1024: 100% 1/1 [00:03<00:00, 3.92s/ba]\n", + "training: 0% 0/100 [00:00\n", + " sys.exit(main())\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/accelerate_cli.py\", line 41, in main\n", + " args.func(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 384, in launch_command\n", + " simple_launcher(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 142, in simple_launcher\n", + " raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)\n", + "subprocess.CalledProcessError: Command '['/usr/bin/python3', '/content/bigscience-metadata/bsmetadata/train.py', 'max_train_steps=100', 'num_eval=1', 'data_config.experiment=without_metadata', 'data_config.per_device_eval_batch_size=8', 'data_config.max_seq_len=768', 'data_config.train_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl', 'data_config.validation_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl']' died with .\n", + "/usr/lib/python3.7/multiprocessing/semaphore_tracker.py:144: UserWarning: semaphore_tracker: There appear to be 6 leaked semaphores to clean up at shutdown\n", + " len(cache))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ty69LlcHt7ZB" + }, + "source": [ + "# !rm -rf outputs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UI_pyX7P0cRG" + }, + "source": [ + "## With URL" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dgPpzdhbnkMd", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "aa02ca4a-5d1e-414a-9564-4cf6f34d68ee" + }, + "source": [ + "# !TOKENIZERS_PARALLELISM=false \\\n", + 
"!accelerate launch --fp16 \\\n", + "/content/bigscience-metadata/bsmetadata/train.py \\\n", + "max_train_steps=100 num_eval=1 \\\n", + "data_config.experiment=with_metadata \\\n", + "data_config.per_device_eval_batch_size=8 \\\n", + "data_config.max_seq_len=768 \\\n", + "data_config.train_file={trn_jsonl_p} \\\n", + "data_config.validation_file={vld_jsonl_p}" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "data_config:\n", + " experiment: with_metadata\n", + " per_device_eval_batch_size: 8\n", + " per_device_train_batch_size: 2\n", + " metadata_list: []\n", + " metadata_sep: ' | '\n", + " metadata_key_value_sep: ': '\n", + " metadata_probability: 1.0\n", + " global_metadata_sep: ' |||'\n", + " max_seq_len: 768\n", + " dataset_name: null\n", + " dataset_config_name: null\n", + " train_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl\n", + " validation_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl\n", + " overwrite_cache: false\n", + " cache_dir: null\n", + " preprocessing_num_workers: null\n", + " validation_split_percentage: 5\n", + " block_size: null\n", + " distributed_type: TPU\n", + "weight_decay: 0.0\n", + "learning_rate: 5.0e-05\n", + "gradient_accumulation_steps: 1\n", + "num_train_epochs: 1\n", + "max_train_steps: 100\n", + "lr_scheduler_type: linear\n", + "num_warmup_steps: 1000\n", + "seed: 42\n", + "out_dir: output_dir\n", + "num_eval: 1\n", + "model_name: gpt2\n", + "project_name: metadata_lm\n", + "\n", + "[2021-09-08 22:39:13,291][datasets.builder][WARNING] - Using custom data configuration default-e08b76da1fb65150\n", + "[2021-09-08 22:39:13,295][datasets.builder][WARNING] - Reusing dataset json 
(/root/.cache/huggingface/datasets/json/default-e08b76da1fb65150/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264)\n", + "Pre-process the text and metadata to create new samples: 100% 10/10 [00:58<00:00, 5.88s/ba]\n", + "Pre-process the text and metadata to create new samples: 100% 1/1 [00:06<00:00, 6.10s/ba]\n", + "Create labels column: 100% 13/13 [00:12<00:00, 1.06ba/s]\n", + "Create labels column: 100% 2/2 [00:01<00:00, 1.60ba/s]\n", + "training: 0% 0/100 [00:00\n", + " sys.exit(main())\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/accelerate_cli.py\", line 41, in main\n", + " args.func(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 384, in launch_command\n", + " simple_launcher(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 142, in simple_launcher\n", + " raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)\n", + "subprocess.CalledProcessError: Command '['/usr/bin/python3', '/content/bigscience-metadata/bsmetadata/train.py', 'max_train_steps=100', 'num_eval=1', 'data_config.experiment=with_metadata', 'data_config.per_device_eval_batch_size=8', 'data_config.max_seq_len=768', 'data_config.train_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl', 'data_config.validation_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl']' died with .\n", + "/usr/lib/python3.7/multiprocessing/semaphore_tracker.py:144: UserWarning: semaphore_tracker: There appear to be 6 leaked semaphores to clean up at shutdown\n", + " len(cache))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UPTWrf_Tt74I" + }, + "source": [ + "# !rm -rf outputs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ckWrWh2r07ZH" + }, + "source": [ + "## TBA" + ] + }, + { + "cell_type": "code", + "metadata": { + 
"id": "xTkK8Ye2B16g" + }, + "source": [ + "!pip install git+https://git@github.com/bigscience-workshop/promptsource.git@main" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ozeXjIj79zWv" + }, + "source": [ + "# from promptsource.utils import get_dataset\n", + "# tydiqa_pri_ds, failed = get_dataset(ds_key, ds_subkey)\n", + "# tydiqa_pri_ds, failed\n", + "\n", + "ds_key, ds_subkey = \"tydiqa\", \"primary_task\"\n", + "tydiqa_pri_trn_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"train\",\n", + " data_dir=DATA_DIR,\n", + " streaming=True\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8JYUCIzJQDsw" + }, + "source": [ + "from promptsource.templates import TemplateCollection\n", + "template_collection = TemplateCollection()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Kgrmu0xGQUMp" + }, + "source": [ + "tydiqa_pri_tmpls = template_collection.get_dataset(ds_key, ds_subkey)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "o3cBp_wCQ11h" + }, + "source": [ + "tydiqa_pri_tmpls.all_template_names" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LIQjZWnvSzW5" + }, + "source": [ + "from promptsource.utils import removeHyphen" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nPQdsLxMQ89g" + }, + "source": [ + "done = False\n", + "for data in tydiqa_pri_trn_ds:\n", + " if data[\"language\"] != \"english\":\n", + " continue\n", + " for tmpl in tydiqa_pri_tmpls.templates.values():\n", + " inst = tmpl.apply(removeHyphen(data))\n", + " if len(inst) == 2:\n", + " data.pop(\"passage_answer_candidates\", None)\n", + " # data.pop(\"plaintext_start_byte\", None)\n", + " display(data)\n", + " prmp, ans = inst\n", + " 
display(prmp)\n", + " display(ans)\n", + " outcome = {\n", + " \"text\": prmp,\n", + " \"metadata\": [{\n", + " \"key\": \"url\",\n", + " \"type\": \"global\",\n", + " \"value\": data[\"document_url\"],\n", + " }],\n", + " \"answer\": ans,\n", + " }\n", + " print(outcome)\n", + " done = True\n", + " if done:\n", + " break\n", + " if done:\n", + " break" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AzAgDN7G9XFQ" + }, + "source": [ + "# EDA" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eqjnzx-5a2Tf" + }, + "source": [ + "## Preprocessing for mC4 jsonl" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8PpaITdECoqr" + }, + "source": [ + "from urllib.parse import urlsplit\n", + "\n", + "\n", + "trn_url_parts_map = dict()\n", + "with jsonlines.open(trn_jsonl_p) as trn_jsonl:\n", + " for trn_json in trn_jsonl:\n", + " trn_url = next(filter(lambda x: x[\"key\"] == \"url\", trn_json[\"metadata\"]))['value']\n", + " trn_url_parts = urlsplit(trn_url)\n", + " trn_url_parts_map[trn_url] = trn_url_parts" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "eCB5jZmJFMq-" + }, + "source": [ + "from urllib.parse import unquote_plus\n", + "\n", + "import pandas as pd\n", + "\n", + "\n", + "trn_url_parts_df = pd.DataFrame(\n", + " data=[(\n", + " url,\n", + " parts.netloc,\n", + " sum(1 for _ in filter(None, parts.netloc.split('.'))),\n", + " parts.hostname if parts.hostname else '',\n", + " parts.port if parts.port else '',\n", + " unquote_plus(parts.path),\n", + " sum(1 for _ in filter(None, parts.path.split('/'))),\n", + " unquote_plus(parts.query),\n", + " unquote_plus(parts.fragment)\n", + " ) for url, parts in trn_url_parts_map.items()],\n", + " columns=[\n", + " 'url',\n", + " 'netloc',\n", + " 'netloc_level_cnt',\n", + " 'hostname',\n", + " 'port',\n", + " 'path',\n", + " 'path_level_cnt',\n", + " 'query',\n", + " 'fragment'\n", + 
" ]\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HG-g6IZqcCdk" + }, + "source": [ + "## Basic Stats" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 326 + }, + "id": "T-xQa8YSQGBW", + "outputId": "db5c0976-0d02-4600-905c-e120de3e7bad" + }, + "source": [ + "trn_url_parts_df.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlnetlocnetloc_level_cnthostnameportpathpath_level_cntqueryfragment
0http://www.polkaudio.com/forums/showthread.php...www.polkaudio.com3www.polkaudio.com/forums/showthread.php258429-Are-my-speakers-magnetically-shielded&go...
1http://www.atthecoachhouse.co.uk/events/practi...www.atthecoachhouse.co.uk4www.atthecoachhouse.co.uk/events/practical-course-in-the-natural-crafts...2
2https://www.digitalspy.com/showbiz/a26775099/l...www.digitalspy.com3www.digitalspy.com/showbiz/a26775099/louis-tomlinson-zayn-malik-...3
3http://www.californialandcan.org/Plumas/Farm-R...www.californialandcan.org3www.californialandcan.org/Plumas/Farm-Resources/2
4https://www.seattlepi.com/local/article/Report...www.seattlepi.com3www.seattlepi.com/local/article/Report-cites-major-problems-at-...3
\n", + "
" + ], + "text/plain": [ + " url ... fragment\n", + "0 http://www.polkaudio.com/forums/showthread.php... ... \n", + "1 http://www.atthecoachhouse.co.uk/events/practi... ... \n", + "2 https://www.digitalspy.com/showbiz/a26775099/l... ... \n", + "3 http://www.californialandcan.org/Plumas/Farm-R... ... \n", + "4 https://www.seattlepi.com/local/article/Report... ... \n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + }, + "id": "KK7CcZXXT4sR", + "outputId": "0acfd2b0-818d-47ca-df2d-70de695bf132" + }, + "source": [ + "trn_url_parts_df.describe(percentiles=[.25, .5, .75, .95, .975])" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
netloc_level_cntpath_level_cnt
count10000.00000010000.000000
mean2.8457002.495700
std0.5833741.398491
min2.0000000.000000
25%2.0000001.000000
50%3.0000002.000000
75%3.0000003.000000
95%4.0000005.000000
97.5%4.0000006.000000
max7.00000014.000000
\n", + "
" + ], + "text/plain": [ + " netloc_level_cnt path_level_cnt\n", + "count 10000.000000 10000.000000\n", + "mean 2.845700 2.495700\n", + "std 0.583374 1.398491\n", + "min 2.000000 0.000000\n", + "25% 2.000000 1.000000\n", + "50% 3.000000 2.000000\n", + "75% 3.000000 3.000000\n", + "95% 4.000000 5.000000\n", + "97.5% 4.000000 6.000000\n", + "max 7.000000 14.000000" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 282 + }, + "id": "-4ZLCRmOT4sR", + "outputId": "373f2605-6e13-48d3-db1d-be904ba1f533" + }, + "source": [ + "trn_url_parts_df.netloc_level_cnt.value_counts().sort_index().plot()\n", + "trn_url_parts_df.path_level_cnt.value_counts().sort_index().plot()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": null + }, + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de3xcdZn48c8zmdybNJM2TS+T0kJLL8rVcsfLUi7lokUFBEQKol1ddN39seuCrqKy7LLrBXQVEKFQEAUEXCotlFLAK2pTgQLNhIZyaUrSpE2bS9Nc5/n9cc6k03aSTJLJXM4879crrznznTNnnunlOd98z/c8X1FVjDHGZAdfqgMwxhiTPJb0jTEmi1jSN8aYLGJJ3xhjsoglfWOMySL+VAcwlMmTJ+usWbNSHYYxxmSUjRs37lTVilivpXXSnzVrFtXV1akOwxhjMoqIvDPYaza8Y4wxWcSSvjHGZBFL+sYYk0Us6RtjTBaxpG+MMVnEkr4xxmQRS/rGGJNFLOl72NrXG2lo3ZfqMIwxacSSvkd1dPfxhZ9vZMUf3kp1KMaYNGJJ36NqG9tRhXdbOlMdijEmjVjS96hQYxsA9btteMcYs58lfY8KNbQDsM16+saYKJb0PSrS02/r6qN1X2+KozHGpAtL+h6kqoQa2qkoyQegfrf19o0xDkv6HrR9zz7au/s4c8EUwMb1jTH7WdL3oNpGZzz/zAWVgI3rG2P2s6TvQSE36Z90+CQm5Putp2+MGWBJ34NqGtqYWV7EhHw/wUChjekbYwZY0vegUGM786eWABAMFFlP3xgzwJK+x3T19rO1uSMq6ReyraUTVU1xZMaYdGBJ32PqmjoIK8yfVgpAVXkRe3v62d1pc/WNMZb0PaemwbkpK7qnDzZX3xjjiCvpi0iZiDwqIiERqRGRU0SkXETWicgW9zHg7isi8iMRqRORTSJyfNRxlrn7bxGRZeP1pbJZbWM7Bbk+DptUDEBVoAiAbS02rm+Mib+n/0PgaVWdDxwD1ADXA+tVdS6w3n0OcC4w1/1ZDtwBICLlwI3AScCJwI2RE4VJnFBjO/MqS8jxCQDBcuvpG2P2Gzbpi8hE4EPAPQCq2qOqe4ClwEp3t5XAhe72UuB+dfwZKBORacA5wDpVbVHV3cA6YElCv40h1NjG/KmlA89LC3KZWJjLNkv6xhji6+nPBpqBe0XkJRG5W0SKgUpVbXD3aQQq3e0ZwLao99e7bYO1H0BElotItYhUNzc3j+zbZLnm9m52dvQwf1rJAe3OXH0b3jHGxJf0/cDxwB2qehywl/1DOQCoMx8wIXMCVfUuVV2kqosqKioSccisEamsOW/qgUm/KlBkpRiMMUB8Sb8eqFfVv7jPH8U5Cexwh21wH5vc17cDVVHvD7ptg7WbBInU0I8e3oH9PX2bq2+MGTbpq2ojsE1E5rlNi4HNwCogMgNnGfCEu70KuNKdxXMy0OoOA60FzhaRgHsB92y3zSRITWMblaX5lBfnHdBeVV5Ed1+Y5o7uFEVmjEkX/jj3+zLwoIjkAVuBq3FOGI+IyDXAO8Al7r5rgPOAOqDT3RdVbRGRm4AN7n7fUdWWhHwLAzjTNQ/u5QNUDczg2ceUkoJkh2WMSSNxJX1VfRlYFOOlxTH2VeDaQY6zAlgxkgBNfPr6w2zZ0cHpcycf8lpwYK5+J8fPtFmyxmQzuyPXI97auZee/jALYvT099+VazN4jMl2lvQ9osatoX/wdE2Aojw/k4rz7AYtY4wlfa8INbTh9wmHT54Q8/VguZVYNsZY0veMUGM7c6ZMIM8f+680UmLZGJPdLOl7RKihbaCyZixVgSK279lHOGxz9Y3JZpb0PaC1s5f3WrsGaujHEgwU0tuv7GjvSmJkxph0Y0nfA2p3RO7EHaKnX24llo0xlvQ9IVJzZ8EwPX2wEsvGZDtL+h5Q09BOoCiXKSX5g+4zo8xJ+tbTNya7WdL3gFB
jG/OmliAig+5TkJvDlJJ86+kbk+Us6We4cFgHrblzsKryIltMxZgsZ0k/w23b3UlnTz8LYtyJe7AqW0zFmKxnST/D1QxSQz+WYKCIhtYu+vrD4x2WMSZNWdLPcLWN7YjAkZVx9PTLC+kPKw2tNlffmGxlST/DhRrbmD2pmMK8nGH3HSixbOP6xmQtS/oZLtTYHrOyZixVbtK3cX1jspcl/QzW2dPH27v2Mq9y+PF8gGllBfgE6q3wmjFZy5J+BntjRweqsWvox5Kb42PaRJvBY0w2s6SfwUINbvmFOGbuRMwIFNqYvjFZzJJ+Bgs1tlOclzNQVyceVQFbTMWYbGZJP4PVNDjlF3y+wcsvHCwYKKSxrYvuvv5xjMwYk67iSvoi8raIvCoiL4tItdtWLiLrRGSL+xhw20VEfiQidSKySUSOjzrOMnf/LSKybHy+UnZQVWp3tA9ZQz+WqvIiVKFhj83VNyYbjaSn/3eqeqyqLnKfXw+sV9W5wHr3OcC5wFz3ZzlwBzgnCeBG4CTgRODGyInCjNyOtm72dPayYIga+rFEhoJsXN+Y7DSW4Z2lwEp3eyVwYVT7/er4M1AmItOAc4B1qtqiqruBdcCSMXx+Vqtxa+jPG8FFXLDFVIzJdvEmfQWeEZGNIrLcbatU1QZ3uxGodLdnANui3lvvtg3WfgARWS4i1SJS3dzcHGd42Sfk1tyZN8Ke/tTSAvw+sRLLxmQpf5z7na6q20VkCrBORELRL6qqikhCVtxW1buAuwAWLVpkq3gPItTYxoyyQiYW5o7ofTk+YXpZIdtsBo8xWSmunr6qbncfm4Bf44zJ73CHbXAfm9zdtwNVUW8Pum2DtZtRCDW0D7km7lCqygutp29Mlho26YtIsYiURLaBs4HXgFVAZAbOMuAJd3sVcKU7i+dkoNUdBloLnC0iAfcC7tlumxmhnr4wbzZ3xH0n7sGCZUU2pm9MlopneKcS+LW7FJ8f+IWqPi0iG4BHROQa4B3gEnf/NcB5QB3QCVwNoKotInITsMHd7zuq2pKwb5JF3mzuoC+scdXQj6WqvJCdHd109fZTkDt8dU5jjHcMm/RVdStwTIz2XcDiGO0KXDvIsVYAK0YepokWcmfuxLNaVizBgWqbncyZMrpjGGMyk92Rm4FCDe3k+X3MmlQ8qvdXlUfm6tsQjzHZxpJ+BqppbGfulAn4c0b31zfQ07cSy8ZkHUv6GSjU0Dbq8XyAign55Pl9VnjNmCxkST/D7Oropqm9e9Tj+QA+nxAssxLLxmQjS/oZprbRuRN3LD19gGC5lVg2JhtZ0s8wNZGkP4aePjiF17bZmL4xWceSfoapbWxj8oR8Jk/IH9NxqgJF7O7spaO7L0GRGWMygSX9DBNqHH35hWiREstWjsGY7GJJP4P0h5XaBCX9SInleivHYExWsaSfQd7etZfuvvCIV8uKxRZTMSY7WdLPIJEa+ono6U8qzqMwN8cKrxmTZSzpZ5BQYxs5PmHOlAljPpaIWIllY7KQJf0MUtPQzuGTixNWGTMYKLL6O8ZkGUv6GaR2R1tCxvMjqgLW0zcm21jSzxDtXb1sa9mXkPH8iGCgiPauPlo7exN2TGNMerOknyHe2JG4i7gR+0ssW2/fmGxhST9D1ERm7iRweCd6MRVjTHawpJ8hQo1tlBT4mT6xIGHHrBpI+nYx15hsYUk/Q4Qa2lkwtRR3reKEKC30U5Lvt8JrxmQRS/oZQFWdmjtjrKx5MBGxEsvGZBlL+hmgfvc+Orr7xlxDP5ZgwBZTMSabxJ30RSRHRF4SkSfd57NF5C8iUiciD4tIntue7z6vc1+fFXWMG9z2WhE5J9FfxqsiC6fMS+DMnYiqgNPTV9WEH9sYk35G0tP/ClAT9fy/gVtVdQ6wG7jGbb8G2O223+ruh4gsBC4F3gcsAW4XkcTcWupxocY2YHySfjBQSGdPPy17exJ+bGNM+okr6YtIEDgfuNt9LsAZwKPuLiuBC93tpe5
z3NcXu/svBR5S1W5VfQuoA05MxJfwuprGdmaWFzEh35/wYw+UWLZxfWOyQrw9/duArwJh9/kkYI+qRpZdqgdmuNszgG0A7uut7v4D7THeM0BElotItYhUNzc3j+CreFeooS2hN2VFsxLLxmSXYZO+iFwANKnqxiTEg6repaqLVHVRRUVFMj4yrXX19vPWzr0JvSkrmvX0jcku8YwXnAZ8TETOAwqAUuCHQJmI+N3efBDY7u6/HagC6kXED0wEdkW1R0S/xwxiy44OwgoLxqmnPyHfT6Ao1+bqG5Mlhu3pq+oNqhpU1Vk4F2KfU9VPA88DF7m7LQOecLdXuc9xX39Onakhq4BL3dk9s4G5wF8T9k08qsa9iDtePX2wEsvGZJOxXBn8N+AhEfkP4CXgHrf9HuABEakDWnBOFKjq6yLyCLAZ6AOuVdX+MXx+VqhtbKcg18dMdxhmPFSVFxJyp4UaY7xtRElfVV8AXnC3txJj9o2qdgEXD/L+m4GbRxpkNgs1tjGvsoQcX+LKLxwsGCji2ZomwmHFN46fY4xJPbsjN42pKjUN7eNyJ260qkAhPX1hdnZ0j+vnGGNSz5J+Gmvu6KZlb0/Ca+4cLFJi2aZtGuN9lvTTWChSQ3+8e/ruYio2bdMY77Okn8Yi5RfG68asiBllbk/fpm0a43mW9NNYqKGdqaUFBIrzxvVzCvNymDwh33r6xmQBS/pprKaxfVyKrMViJZaNyQ6W9NNUb3+YN5s6xv0ibkSVLaZiTFawpJ+m3tq5l57+MAvG+SJuRDBQyHt79tEftrr6xniZJf00VdMQKb+QpJ5+oIjefmVHW1dSPs8YkxqW9NNUqLGd3Bzh8MkTkvJ5AyWWbQaPMZ5mST9NhRraOKJiAnn+5PwVWYllY7KDJf00FWpsZ8E4VtY82PSyAkTsrlxjvM6Sfhpq7eylobVr3G/Kipbvz6GypMB6+sZ4nCX9NDSeC6EPpaq80Mb0jfE4S/ppKFLbPpnDO+AUXrOevjHeZkk/DYUa2wgU5TKlJD+pn1sVKKShdR+9/eGkfq4xJnks6aehSA19keQuaBIMFBFWaNhjc/WN8SpL+mkmHFZqG9uTdlNWtOBAiWUb1zfGqyzpp5l3WzrZ19uftPIL0apsMRVjPM+Sfqqpwu+/D3+4FYiqoZ+Cnv60iQXk+MQu5hrjYSNaGN0kWDgMa66D6hXO8/IjqGmYjwjMnZL8pO/P8TG1tMCmbRrjYcP29EWkQET+KiKviMjrIvJtt322iPxFROpE5GERyXPb893nde7rs6KOdYPbXisi54zXl8oI/X3wf190Ev5pX4Hpx8OqL9G0rY7Zk4opzMtJSVhV5YXW0zfGw+IZ3ukGzlDVY4BjgSUicjLw38CtqjoH2A1c4+5/DbDbbb/V3Q8RWQhcCrwPWALcLiKpyWyp1tcDj14Nmx6CM74BZ30HLroHwmEurb+JhVOLUhZaMFBkY/rGeNiwSV8dHe7TXPdHgTOAR932lcCF7vZS9znu64vFmXu4FHhIVbtV9S2gDjgxId8ik/Tug4cuh5pVsOQW+NC/OO3lh9N1znc5JryZK3p+lbLwqgJF7GjrpruvP2UxGGPGT1wXckUkR0ReBpqAdcCbwB5V7XN3qQdmuNszgG0A7uutwKTo9hjvif6s5SJSLSLVzc3NI/9G6ay7HR68GOqehY/+CE7+4gEv11Qs4bH+D3LSu3fD239MSYiREsvbbYjHGE+KK+mrar+qHgsEcXrn88crIFW9S1UXqeqiioqK8fqY5Nu3G+6/EN75E3zybvjAskN2CTW2883eq+ibeBg8/nnobEl6mFZi2RhvG9GUTVXdAzwPnAKUiUhk9k8Q2O5ubweqANzXJwK7ottjvMfbOprhvo9C4yb41ANw1EUxdws1tCH5JeRcfC90NMGqLztTOpOoyr1By8b1jfGmeGbvVIhImbtdCJwF1OAk/0j2WgY84W6vcp/jvv6cqqrbfqk7u2c2MBf4a6K
+SNpqew/uOw921cFlD8H88wfdtaaxnSMrJ+ALHgdnfgtCT+6fzpkkU0oKyM2xufrGeFU88/SnASvdmTY+4BFVfVJENgMPich/AC8B97j73wM8ICJ1QAvOjB1U9XUReQTYDPQB16qqt68W7n4bVn7MGab5zONw2KmD7qrqlF84/+hpTsPJ/wBbn4e1X4OZp0DlwqSEnOMTZpRZiWVjvGrYpK+qm4DjYrRvJcbsG1XtAi4e5Fg3AzePPMwMtHOLk/B7O2HZEzDjA0Pu3tjWReu+XhZEauj7fHDhHXDHafDoZ2H585BbmITArcSyMV5mZRjGQ+NrcO+5EO6Fq1YPm/ABQg1ODf350TX0J0yBj98JzTWw9uvjFe0hnBu0rKdvjBdZ0k+0+o1w3/mQkwdXPwVT3x/X22oGWy1rzmI49R+h+h6o+U2io40pGChiZ0cP+3q8PfpmTDaypJ9Ib/8R7l8KhWVOwp88N+63hhramVFWSGlB7qEvnvENmH4cPPElaK1PYMCxRebqW2/fGO+xpJ8odc/Czz8JpdPh6qchcNiI3h5qbGPBYJU1/XnwyXsg3AePfR7C49sDD1qJZWM8y5J+ItQ8Cb+8DCbPgavXQOm0Eb29u6+fN5v3Dr0Q+qQj4PwfwLt/gt99b4wBD61qYDEVu5hrjNdY0h+rTb+CR66EacfAsiehePKID1HX1EF/WJk/3MIpx3wKjr4UfnsLvPPiKAMeXsWEfPL9Ppu2aYwHWdIfi433OeUSDjsVPvNrZyx/FGobnZk7gw7vRDv/exCYBY99btzKNIgIwYCVWDbGiyzpj9aLt8NvvgJzz4JP/wryR7/oSaixnTy/j1mTioffOb/EGd/vaITf/OO4lWmwEsvGeJMl/ZFShd99F9beAAs+Bp96cMw3TdU0tHFk5QT8OXH+dcw4Hhbf6Ezh3HjvmD57MLaYijHeZMsljoQqrP+2s57t0ZfC0p9Aztj/CL98xlw6e/qG3zHaKV9yyjQ8fYNTpmHKgjHHES0YKGJPZy/tXb2UxJpGaozJSNbTH4m/3e8k/EWfdUokJCDhA5w4u5yPzJsysjf5fHDhnc5wz6OfdRZnSaCqgJVYNsaLLOmPxKu/gooFztRJXxr80ZVUOom/aTM88+8JPfRAiWWbwWOMp6RB5soQnS3wzh+d0sgiqY5mv7lnOkM9G+527hdIkKD19I3xJEv68XpjLWgY5p+X6kgOtfhGmHYsPHFtwso0BIpyKc7LsRk8xniMJf141a6Gkukw7ZAq06nnz4OLVkB/Lzy+PCFlGpy5+lZi2RivsaQfj959ULce5p2bHmP5sUw6As7/vjME9fvvJ+SQVeW2mIoxXpOmGSzNbP2tsxhKOg7tRDvmUjjqEnjhv+DdP4/5cMFAEdt370OTvE6vMWb8WNKPR+1qyCuBWR9MdSRDE3F6+2UznTIN+3aP6XDBQCHt3X207utNUIDGmFSzpD+ccD/UPuWUW/Dnpzqa4RWUwidXQHsDrBpbmQabwWOM91jSH059NextdqZqZorgB2DxN6FmFfz1rlEfxubqG+M9lvSHU7safH6Yc2aqIxmZU74M886Dp/5t1Mss2mIqxnjPsElfRKpE5HkR2Swir4vIV9z2chFZJyJb3MeA2y4i8iMRqRORTSJyfNSxlrn7bxGRZeP3tRIotAZmnT7qsskp4/M51TiDi+DRa0ZVf39iYS6lBX4b3jHGQ+Lp6fcB16nqQuBk4FoRWQhcD6xX1bnAevc5wLnAXPdnOXAHOCcJ4EbgJOBE4MbIiSJtNb8Bu7bA/AtSHcno5BXB5Y84F3Z/+SloCo34EMFAkQ3vGOMhwyZ9VW1Q1b+52+1ADTADWAqsdHdbCVzobi8F7lfHn4EyEZkGnAOsU9UWVd0NrAOWJPTbJFrtaudx3rmpjWMsisrhisfAX+Cs4du6fURvtxLLxnjLiMb0RWQWcBzwF6BSVRvclxqBSnd7BrAt6m31bttg7ekrtMZZBnFiMNW
RjE3gMPj0o9DVCg9eBPv2xP3WyF25NlffGG+IO+mLyATgMeCfVLUt+jV1MkJCsoKILBeRahGpbm5uTsQhR6ejCeo3wLwMmrUzlGlHw6UPws4t8NDl0NsV19uqAoXs6+1n196ecQ7QGJMMcSV9EcnFSfgPqurjbvMOd9gG97HJbd8OVEW9Pei2DdZ+AFW9S1UXqeqiioqKkXyXxKp9CtDMmqo5nMM/DB+/0ynV8Ov4avRUlbszeGxc3xhPiGf2jgD3ADWq+oOol1YBkRk4y4AnotqvdGfxnAy0usNAa4GzRSTgXsA9221LT6HVzgXQyvelOpLEOuoiOPtm2PyEs+rWMMM2doOWMd4Sz9JPpwGfAV4VkZfdtq8BtwCPiMg1wDvAJe5ra4DzgDqgE7gaQFVbROQmYIO733dUtSUh3yLRujtg6wvOClnpVDs/UU79knPH7os/htJpcPo/D7prMODeoGVz9Y3xhGGTvqr+ARgs8y2Osb8C1w5yrBXAipEEmBJvPgf93d4a2jnYWTdBxw549ltQMs0p1hZDcb6f8uI86+kb4xG2MHostWugMOAsOO5VPh8svd25YP3EtVA8edC7jqsCVmLZGK+wMgwH6++DN56GueckbOHztOXPg0/9HKYsgIevhPdeirlbpMSyMSbzWdI/2LsvOiWJ0712fqIUlDpz+IsnwYMXQ8vWQ3YJujdohcM2V9+YTGdJ/2C1ayAnH4445HKFd5VMhSsed6Zw/vyT0HHg/RHBQBE9/WGaO7pTFKAxJlEs6UdThdCTcPhHIH9CqqNJrslznTo9bQ3wi4udGUyuqoCVWDbGKyzpR9vxOux5N3uGdg5WdQJcfB80bIJfLXMWWsfm6hvjJZb0o9WuAQSOzOACa2M1bwlccCvUPTuw8lbQevrGeIbHp6eMUGg1BE+Aksrh9/WyDyyD9kZ44T+hdBoFi79JRUm+3aBljAdY0o9orYeGl+HMb6U6kvTw4a9C+3vw++9DyTSqAgtteMcYD7DhnYjap5xHr1TVHCsROO/7zpKLa/6V83OrradvjAdY0o8IrYZJc6DiyFRHkj5y/O6Siyew7L3/YEbry/T1h1MdlTFmDCzpg7OoyNu/93atndHKK4LLH2Zv0Qx+6v8uO996JdURGWPGwJI+ODNVwn02tDOYonLeOHMlXeQRePyyES+5aIxJH5b0wRnaKa6A4KJUR5K2psycw7Ke6/H1tDt37Y5gyUVjTPqwpN/X4/T0j1wCvpxUR5O2pk0spJaZ/N+878KuLfD45+NaecsYk14s6b/9e+hug/kXpDqStJbn9zGttIAXdSGc+z+w5Rl4/uZUh2WMGSFL+qHVkFvkrB9rhhQMFDlz9U+4Bj5wlTOH//VfpzosY8wIZHfSV3Xm5x9xBuQWpjqatBcsL6Q+Uorh3P+BqpPg//4BGl9LbWDGmLhld9J/7yXnrlObqhmXYKCIxrYuevrC4M+HS+6Hgonw0OXQmZ7LHRtjDpTdST+0GsTnXMQ1w6oKFBJWaGh1yzGUTHVW3mpvgEevdlYdM8aktexO+rVrYOapUFSe6kgyQswSy8FFTlXOrS/AszemJjBjTNyyN+m3vAVNm7O3dv4oVJUPUmL5uCvgxL+HF38Mmx5JQWTGmHgNm/RFZIWINInIa1Ft5SKyTkS2uI8Bt11E5EciUicim0Tk+Kj3LHP33yIiy8bn64xA7RrncZ4l/XhNLS0gxyexq22eczPM+iCs+vKgC6wbY1Ivnp7+fcDBg97XA+tVdS6w3n0OcC4w1/1ZDtwBzkkCuBE4CTgRuDFyokiZ0GqY8j4on53SMDKJP8fH9LKC2NU2c3KdVbeKK+ChKw5ZZ9cYkx6GTfqq+jvg4KkZS4GV7vZK4MKo9vvV8WegTESmAecA61S1RVV3A+s49ESSPHt3wbsv2tDOKFQFigZfQat4Mlz6IHTuOmC5RWNM+hjtmH6lqja4241AZKmpGcC2qP3q3bbB2g8hIstFpFpEqpu
bx6m3uGUtaNiGdkYhGCgcejGVacfAx/4X3vkjPH1D8gIzxsRlzBdyVVUBTUAskePdpaqLVHVRRUVFog57oNBqKJkO048bn+N7WFWgiKb2brp6h6i7c/TFcOqXYcPP4G/3Jy84Y8ywRpv0d7jDNriPTW77dqAqar+g2zZYe/L17oM3n3OGdkRSEkImC7ozeLbvGWbpxDO/7dzpvPo62LYhCZEZY+Ix2qS/CojMwFkGPBHVfqU7i+dkoNUdBloLnC0iAfcC7tluW/JtfQF6O21oZ5TmTy0F4MfP1eH8kjcIX46z6lbpdHj4CmhrGHxfY0zSxDNl85fAi8A8EakXkWuAW4CzRGQLcKb7HGANsBWoA34G/AOAqrYANwEb3J/vuG3JF1oN+aXO9EIzYgumlXLdWUfy65e288P1W4beuagcLv0FdLfDI5+Bvu7kBGmMGZR/uB1U9bJBXlocY18Frh3kOCuAFSOKLtHC/fDG0zD3LPDnpTSUTPalM+bw9q5Obnt2C4dNKuLjxwUH37nyffDxO+CRK52hno/9rw2rGZNC2XVHbv0G2NtsQztjJCL81yeO4uTDy/nqo5v4y9ZdQ79h4VL40L/CSw/AhruTE6QxJqbsSvqh1eDLdXr6Zkzy/D5+esUiZpYXsfyBjbzZ3DH0Gz7yNaew3dPXw9t/TE6QxphDZE/SV3WS/qzTnXLAZswmFuVy71Un4vcJn71vA7s6hhiz9/ngE3dBYLYz1NNan7xAjTEDsifp73wDWt602vkJNnNSET9btojG1i6WP7Bx6Pn7BROdC7v9PU4N/t5hpn0aYxIue5J+aLXzaOP5CXf8zAA/uORYNr6zm3/51SuEw0NM5aw4Ej7xM2jYBL/5ivMbmDEmabIn6deugWnHwsSY1R/MGJ1/9DT+bcl8ntzUwA/WvTH0zvOWwN99HTY9DH++PTkBGmOAbEn67Y1QX21DO+PsCx8+nEtPqOLHz9fxSPW2oXf+4HWw4KPwzL/Dm88nJ0BjTJYk/dqnALWkP85EhJsufD+nz5nM1x5/lT/V7Rx8Z58PLrwTKuY7Sy22vJW8QI3JYlmS9NdA2WEwZWGqI/G83Bwft19xPLMnF/P3P99IXVP74OsgWRAAAAp5SURBVDvnT3BKMWsYHvo09OxNXqDGZCnvJ/3uDtj6W6eXb3eCJkVpQS73Xn0C+f4crrp3A83tQ0zlLD8cLloBzTXwq6ugqSZpcRqTjbyf9N9cD/3dNmsnyYKBIu5ZtoidHd18/v7qoadyzjkTzvkvqFsPt58Md30E/nIXdKamPJMxXub9pB9aDYUBmHlKqiPJOsdUlXHbp47jlfo9/L9HXh56KufJX4DrQnDOf0J/Hzz1r/C9I50KnaE1tgqXMQni7aTf3wtvrHVu/88ZtracGQdL3j+Vr5+3gDWvNvLfa0ND7zxhCpxyLXzxD/CFP8CJy+GdF+Ghy+D7852VuBpfTU7gxniUtzPhuy9C1x4b2kmxa06fzdu79vLT327lsPJiLj9p5vBvmnoULDkKzvo21D0LL/8C/vozZ15/5VFw7GVw1CUwYZxWVzPGo7yd9ENrICffWcHJpIyI8K2Pvo9tLfv4xhOvEQwU8qEj40zWObkw71znp7MFXnvMOQGs/Ro88w2Ye7ZzAjhyCfjzx/eLGOMBMuTqRym2aNEira6uHt2bVeG2o6FyIVz+cGIDM6PS3tXLxXe+SP3ufTz2xVOZN7Vk9AdrCsErv4BXHoaORue6zfsvck4A04+3mVomq4nIRlVdFOs1747p73gNWt+1oZ00UlKQy4qrTqA4P4fP3reBprau0R9synw46zvwz6/Dpx9zfpt76QH42Rnwk5PgD7fZEo3GxODdpB9aA4gzLGDSxvSyQu5ZdgK7O3v43P3VdPb0je2AOX6Ye6Yz1/+6WrjgNigsg2dvhFsXwgOfcK4FvPMnmwJqDF4e3rnzg5BbCNc8k9igTEKsr9nB5++vZvGCSu684gPk+BI8HLO
zDl75pVPUrTWqDtCESpiyACoWOI9TFjilIApKE/v5xqTQUMM73kz6e7bBbe+HM78Np/9T4gMzCXHfH9/iW7/ZzOdOn82/XzBOJTJUnQVbmkPO3b5NNc7dv8210Nu5f7/SoDNkFH1CqJgHecXjE5cx42iopO/N2Ts9e53ZHFZgLa1dddps3t7Vyd1/eIvDJhXxmVNmJf5DRKCsyvmJXiYzHIY977gng83OheHmGnjr984d3M6boWymU7NpynznsWI+TD4ScgsSH6sxSZD0nr6ILAF+COQAd6vqLYPtO6bhHZMR+sPK3z9QzXOhJn5y+fGcOmcyJfl+fIke7ok7oD7Y/bZzAmiK+tm1BcLu9QfxOcs+TpgCeROc3wbyJ0BeifsYaStxtgfaDtq2GwbNOEmb4R0RyQHeAM4C6oENwGWqujnW/pb0s8Pe7j4u+emLvP5e20BbSb6fkgI/JQW5lBY6jyUFfkrdx4HnhZH2yGvO86K8HCSR0zb7e2HXm85vBc0h56ezBXo6nKJ+PR3Ob5jd7UCc/6f8BVEnghLnRJFX7NxvkJMLOXngy92/nZM7SHuecwKJbPuitiPv8eWCL8c5Yfn87naO8xi9PdDmd/eNtPn3v9+mw6a9dBreORGoU9WtACLyELAUiJn0TXYozvfzi8+dzNrNjbTt66Wtq4/2rl7a9jmP7V197Gjroq5p//O+oer4ADk+cU8OfvL9Oajq/lSs+9NypD3S91F0/7bu38d9G6oTgZNQTuSLHz6Cq06bfeAHqzrXCgZOBFEnhe5258Qw0NbuPu7d//q+3c4JJtzrrCXcH/0Y2e4h7hPLeBDf/hPBwElAoh456PkQj+Ib4r3j/kXcBzm0bbD2uPZNkDlnwjk3J/ywyU76M4DoJZXqgZOidxCR5cBygJkz47hd33jCxKJcLllUFde+qsq+3n7a3ZNDa9TJoc19HHi+r5fefjdByv7/piKCsP//qrhtke2B/+OIk5sG3ue0ARw2KcZFXpH9PXYqR/aHMBLh/tgng3Df/u3+qO1wr3MdQ/udfcL97nastv4Dtwdt63NOcqo4Z9IxPB7cNt4GRjg0Rttg7XHsm0il08flsGk3qKiqdwF3gTO8k+JwTBoSEYry/BTl+akszdILqr4c8BU605KNGYFk35y1HYjuzgXdNmOMMUmQ7KS/AZgrIrNFJA+4FFiV5BiMMSZrJXV4R1X7RORLwFqcKZsrVPX1ZMZgjDHZLOlj+qq6BliT7M81xhjj5YJrxhhjDmFJ3xhjsoglfWOMySKW9I0xJoukdWllEWkG3hnDISYDOxMUznjLpFghs+K1WMdPJsWbSbHC2OI9TFVjLkSd1kl/rESkerCiQ+kmk2KFzIrXYh0/mRRvJsUK4xevDe8YY0wWsaRvjDFZxOtJ/65UBzACmRQrZFa8Fuv4yaR4MylWGKd4PT2mb4wx5kBe7+kbY4yJYknfGGOyiCeTvogsEZFaEakTketTHc9QRKRKRJ4Xkc0i8rqIfCXVMQ1HRHJE5CUReTLVsQxHRMpE5FERCYlIjYickuqYBiMi/+z+G3hNRH4pImm1QoyIrBCRJhF5LaqtXETWicgW9zGQyhgjBon1u+6/g00i8msRKUtljNFixRv12nUioiIyORGf5bmk7y6+/hPgXGAhcJmILExtVEPqA65T1YXAycC1aR4vwFeAmlQHEacfAk+r6nzgGNI0bhGZAfwjsEhV349TevzS1EZ1iPuAJQe1XQ+sV9W5wHr3eTq4j0NjXQe8X1WPBt4Abkh2UEO4j0PjRUSqgLOBdxP1QZ5L+kQtvq6qPUBk8fW0pKoNqvo3d7sdJynNSG1UgxORIHA+cHeqYxmOiEwEPgTcA6CqPaq6J7VRDckPFIqIHygC3ktxPAdQ1d8BLQc1LwVWutsrgQuTGtQgYsWqqs+oap/79M84K/elhUH+bAFuBb5KAhfi9WLSj7X4etom0WgiMgs4DvhLaiMZ0m04/wjDqQ4kDrO
BZuBedzjqbhGJsZp56qnqduB7OD26BqBVVZ9JbVRxqVTVBne7kXFdDT6hPgs8leoghiIiS4HtqvpKIo/rxaSfkURkAvAY8E+q2pbqeGIRkQuAJlXdmOpY4uQHjgfuUNXjgL2kz/DDAdyx8KU4J6rpQLGIXJHaqEZGnfnfaT8HXES+jjOs+mCqYxmMiBQBXwO+mehjezHpZ9zi6yKSi5PwH1TVx1MdzxBOAz4mIm/jDJudISI/T21IQ6oH6lU18pvTozgngXR0JvCWqjarai/wOHBqimOKxw4RmQbgPjalOJ4hichVwAXApzW9b1I6AqcD8Ir7/y0I/E1Epo71wF5M+hm1+LqICM6Yc42q/iDV8QxFVW9Q1aCqzsL5c31OVdO2N6qqjcA2EZnnNi0GNqcwpKG8C5wsIkXuv4nFpOlF54OsApa528uAJ1IYy5BEZAnO0OTHVLUz1fEMRVVfVdUpqjrL/f9WDxzv/pseE88lffdCTWTx9RrgkTRffP004DM4veaX3Z/zUh2Uh3wZeFBENgHHAv+Z4nhicn8beRT4G/Aqzv/NtCobICK/BF4E5olIvYhcA9wCnCUiW3B+W7kllTFGDBLrj4ESYJ37/+zOlAYZZZB4x+ez0vs3HGOMMYnkuZ6+McaYwVnSN8aYLGJJ3xhjsoglfWOMySKW9I0xJotY0jfGmCxiSd8YY7LI/wfpaZNjMImHLQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ghRgQvOnVewZ", + "outputId": "b9c72b93-0728-4a11-8ecc-b38a30062f27" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.netloc_level_cnt==7].netloc.tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['bmcinfectdis.biomedcentral.com.preview-live.oscarjournals.springer.com',\n", + " '0-www.loc.gov.oasys.lib.oxy.edu',\n", + " 'www.bennettandbelfort.com.php73-36.phx1-1.websitetestlink.com']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2cmQUCorT4sS", + "outputId": "5813272b-0007-43b2-c817-c57302a8ab76" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.path_level_cnt==14].path.tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/MetroKids/Attractions-and-Events/Attractions-Guide/index.php/alpha/P/category/Theaters/ages/Parents/city/Malvern/county/NJ+-+Burlington+County/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sR1h230cWTdo", + "outputId": "ce8d4cb8-2730-4a34-f4d0-2b5cd84a944b" + }, + "source": [ + "trn_url_parts_df = trn_url_parts_df.assign(\n", + " yearlike_path=trn_url_parts_df.path.str.contains(r\"20[0-2][0-9]|19\\d\\d\")\n", + ")\n", + "print(\n", + " f\"{len(trn_url_parts_df[trn_url_parts_df.yearlike_path].index)/len(trn_url_parts_df.index):.2%}\"\n", + ")\n", + "trn_url_parts_df.yearlike_path.value_counts().to_dict()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "22.52%\n" + ] + }, + { + 
"output_type": "execute_result", + "data": { + "text/plain": [ + "{False: 7748, True: 2252}" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Tf3hlUV1WTdt", + "outputId": "544100f9-0122-42b5-e309-e0a5f8daf00f" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.yearlike_path].path[:10].tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/news/2005/03/22/on-time-and-on-budget/',\n", + " '/2013/01/19/kitchen-tip-7-how-to-cook-pasta/',\n", + " '/2014/08/18/us-pay-tv-providers-shed-300000-subs-in-q2/',\n", + " '/2011/11/24/reflections-thanksgiving-2011/',\n", + " '/2019/',\n", + " '/oem-parts/2005-suzuki-dr200se-gasket-set/o/m16072sch590528',\n", + " '/Products/Genuine-Joe-Linen-like-Table-Skirts__GJO11915.aspx',\n", + " '/listing/485271256/vintage-90s-1995-58th-annual-postal',\n", + " '/2013/01/08/calendar-cat/',\n", + " '/2012/06/new-model-army-equipment-stolen-can-you-help/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OG8YdBMqWTdu", + "outputId": "8e1a083e-df6f-4519-e47f-a08f250b4177" + }, + "source": [ + "import warnings\n", + "\n", + "\n", + "warnings.simplefilter(action=\"ignore\") # Just don't want to see `UserWarning` about regex group\n", + "trn_url_parts_df = trn_url_parts_df.assign(\n", + " datelike_path=trn_url_parts_df.path.str.contains(\n", + " r\"(20[0-2][0-9]|19\\d\\d)[/\\-](0?[1-9]|11|12)[/\\-](0?[1-9]|[12]\\d|3[01])\"\n", + " )\n", + ")\n", + "print(\n", + " f\"{len(trn_url_parts_df[trn_url_parts_df.datelike_path].index)/len(trn_url_parts_df.index):.2%}\"\n", + ")\n", + "trn_url_parts_df.datelike_path.value_counts().to_dict()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": 
"stdout", + "text": [ + "7.34%\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{False: 9266, True: 734}" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vN1sK3uzWTdu", + "outputId": "1cf1dbef-5e98-4112-e20c-05f1a13b3f80" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.datelike_path].path[:10].tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/news/2005/03/22/on-time-and-on-budget/',\n", + " '/2013/01/19/kitchen-tip-7-how-to-cook-pasta/',\n", + " '/2014/08/18/us-pay-tv-providers-shed-300000-subs-in-q2/',\n", + " '/2011/11/24/reflections-thanksgiving-2011/',\n", + " '/2013/01/08/calendar-cat/',\n", + " '/2016/02/04/the-jesus-revolution/',\n", + " '/2016/03/18/colorado-avalanche-vs-calgary-flames-nhl-betting-hockey-odds-pick-and-prediction/',\n", + " '/news/2013/08/06/communities-should-be-paid-for-wind-and-solar-in-their-neighbourhood/',\n", + " '/2017/11/20/maurizio-cannavacciuolo/',\n", + " '/index.php/2014/02/19/showing-a-little-love/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wexEuNAuOTrI" + }, + "source": [ + "## Deprecated Basic EDA of C4 samples.tgz\n", + "Old code for raw data of C4 samples.tgz (uncompressed as `dst/en_meta.json`, for example)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cEPExQdIbqN1" + }, + "source": [ + "### English" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "m9e6X_9cSFPr" + }, + "source": [ + "import json\n", + "import pandas as pd\n", + "\n", + "from collections import namedtuple\n", + "from glob import glob\n", + "from urllib.parse import unquote_plus, urlsplit" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Otf_HTdUSX8y" + }, + "source": [ + "%cd /content/drive/MyDrive/colab_data/dst" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ucRZz8zRxVFW" + }, + "source": [ + "with open('en_meta.json') as en_meta_f:\n", + " en_meta_json = json.load(\n", + " en_meta_f,\n", + " object_hook=lambda d: namedtuple(\n", + " 'Meta',\n", + " map(lambda k: k.replace('-', '_'), d.keys())\n", + " )(*d.values())\n", + " )" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8JyoUMQKxttO" + }, + "source": [ + "en_meta_offset_uri_map = {\n", + " row.offset: row.headers.warc_target_uri for row in en_meta_json\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5ml3w27g6TBb" + }, + "source": [ + "en_uri_parts_map = {\n", + " uri: urlsplit(uri) for uri in en_meta_offset_uri_map.values()\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "d-_TysttCbFD" + }, + "source": [ + "i = 0\n", + "for (uri, parts) in en_uri_parts_map.items():\n", + " if '%' in parts.path:\n", + " print(f\"{i:02d} {uri} -> {unquote_plus(parts.path)} {unquote_plus(parts.query)} {unquote_plus(parts.fragment)}\")\n", + " i += 1\n", + " if i > 50:\n", + " break" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V1cbOBx2w2bd" + }, + "source": [ + "en_uri_parts_df 
= pd.DataFrame(\n", + " data=[(\n", + " uri,\n", + " unquote_plus(parts.path),\n", + " sum(1 for _ in filter(None, parts.path.split('/'))),\n", + " unquote_plus(parts.query),\n", + " unquote_plus(parts.fragment)\n", + " ) for uri, parts in en_uri_parts_map.items()],\n", + " columns=['uri', 'path', 'path_level_cnt', 'query', 'fragment']\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6lisgI0hbLUd" + }, + "source": [ + "#### Number of Path Levels" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OnRoyOSQxEpj" + }, + "source": [ + "en_uri_parts_df.describe(percentiles=[.25, .5, .75, .95, .975])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dSbh_K4FKIjJ" + }, + "source": [ + "en_uri_parts_df.path_level_cnt.value_counts().sort_index().plot()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uvslgW7mFn71" + }, + "source": [ + "en_uri_parts_df[en_uri_parts_df.path_level_cnt==33].uri.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "w9LU95g1IHvz" + }, + "source": [ + "en_uri_parts_df[en_uri_parts_df.path_level_cnt==5].path.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QluA_8eLbTqr" + }, + "source": [ + "#### Year and Date" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Wjs8GczPOSPp" + }, + "source": [ + "en_uri_parts_df = en_uri_parts_df.assign(\n", + " yearlike_path=en_uri_parts_df.path.str.contains(r\"20[0-2][0-9]|19\\d\\d\")\n", + ")\n", + "display(f\"{len(en_uri_parts_df[en_uri_parts_df.yearlike_path].index)/len(en_uri_parts_df.index):.2%}\")\n", + "en_uri_parts_df.yearlike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vx5ls4MESADX" + }, + "source": 
[ + "en_uri_parts_df[en_uri_parts_df.yearlike_path].path.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Zck_PuJ0PoCx" + }, + "source": [ + "en_uri_parts_df = en_uri_parts_df.assign(\n", + " datelike_path=en_uri_parts_df.path.str.contains(\n", + " r\"(20[0-2][0-9]|19\\d\\d)[/\\-](0?[1-9]|1[0-2])[/\\-](0?[1-9]|[12]\\d|3[01])\"\n", + " )\n", + ")\n", + "display(f\"{len(en_uri_parts_df[en_uri_parts_df.datelike_path].index)/len(en_uri_parts_df.index):.2%}\")\n", + "en_uri_parts_df.datelike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BrJFJtjEPuMx" + }, + "source": [ + "en_uri_parts_df[en_uri_parts_df.datelike_path].path.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "__lnPt_ya_SS" + }, + "source": [ + "### All" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "SnoCgIxkTVWs" + }, + "source": [ + "meta_jsons = []\n", + "for meta_json_fname in glob('??_meta.json'):\n", + " with open(meta_json_fname) as meta_json_f:\n", + " meta_jsons.extend(\n", + " json.load(\n", + " meta_json_f,\n", + " object_hook=lambda d: namedtuple(\n", + " 'Meta',\n", + " map(lambda k: k.replace('-', '_'), d.keys())\n", + " )(*d.values())\n", + " )\n", + " )" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2dz9enqbT6iK" + }, + "source": [ + "meta_offset_uri_map = {\n", + " row.offset: row.headers.warc_target_uri for row in meta_jsons\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1cqTjbtjV1oR" + }, + "source": [ + "uri_parts_map = {\n", + " uri: urlsplit(uri) for uri in meta_offset_uri_map.values()\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qgLPRtDnVVEk" + }, + "source": [ +
"uri_parts_df = pd.DataFrame(\n", + " data=[(\n", + " uri,\n", + " unquote_plus(parts.path),\n", + " sum(1 for _ in filter(None, parts.path.split('/'))),\n", + " unquote_plus(parts.query),\n", + " unquote_plus(parts.fragment)\n", + " ) for uri, parts in uri_parts_map.items()],\n", + " columns=['uri', 'path', 'path_level_cnt', 'query', 'fragment']\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0nXMJuiMbcBx" + }, + "source": [ + "#### Number of Path Levels" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3y7CVd1iV_yN" + }, + "source": [ + "uri_parts_df.describe(percentiles=[.25, .5, .75, .95, .975])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "88eQb3RFWGnk" + }, + "source": [ + "uri_parts_df.path_level_cnt.value_counts().sort_index().plot()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "isBR1jmUWMb0" + }, + "source": [ + "uri_parts_df[uri_parts_df.path_level_cnt==61].uri.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j3EMRvoAbf9Y" + }, + "source": [ + "#### Year and Date" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fLdePo9-Wqx8" + }, + "source": [ + "uri_parts_df = uri_parts_df.assign(\n", + " yearlike_path=uri_parts_df.path.str.contains(r\"20[0-2][0-9]|19\\d\\d\")\n", + ")\n", + "display(f\"{len(uri_parts_df[uri_parts_df.yearlike_path].index)/len(uri_parts_df.index):.2%}\")\n", + "uri_parts_df.yearlike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iffYbZ_BXNjE" + }, + "source": [ + "uri_parts_df = uri_parts_df.assign(\n", + " datelike_path=uri_parts_df.path.str.contains(\n", + " r\"(20[0-2][0-9]|19\\d\\d)[/\\-](0?[1-9]|1[0-2])[/\\-](0?[1-9]|[12]\\d|3[01])\"\n", + " )\n", + ")\n", +
"display(f\"{len(uri_parts_df[uri_parts_df.datelike_path].index)/len(uri_parts_df.index):.2%}\")\n", + "uri_parts_df.datelike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rWJKdnZFZSvH" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file