diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 00000000..9122aab1 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,1702 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "BigScience Metadata playground", + "provenance": [], + "collapsed_sections": [ + "SbhiOxCY8rZ2", + "veqze27o9HEX", + "VpEg7z7CAiUc", + "L6_jjYRf0ZXN", + "UI_pyX7P0cRG", + "ckWrWh2r07ZH", + "wexEuNAuOTrI", + "cEPExQdIbqN1", + "6lisgI0hbLUd", + "QluA_8eLbTqr", + "__lnPt_ya_SS", + "0nXMJuiMbcBx", + "j3EMRvoAbf9Y" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "TPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "rE6KUpKC-xb7" + }, + "source": [ + "# !nvidia-smi -L" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5DtYDK2LL6jM" + }, + "source": [ + "# import os\n", + "# from tensorflow.python.profiler import profiler_client\n", + "\n", + "# tpu_profile_service_address = os.environ['COLAB_TPU_ADDR'].replace('8470', '8466')\n", + "# print(profiler_client.monitor(tpu_profile_service_address, 100, 2))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V5QKUdkUMBAr" + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "reQ0s7ej6tO2" + }, + "source": [ + "# Data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qX28VK4UYGRX" + }, + "source": [ + "from pathlib import Path\n", + "\n", + "\n", + "DATA_DIR=Path(\"/content/drive/MyDrive/colab_data/bigscience\")\n", + "!mkdir -p {DATA_DIR}\n", + "\n", + "!rm -rf sample_data" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RwcpNOdzTo-o" + }, + "source": [ + "TRN_CNT, TRN_OFFSET = 10000, 
0\n", + "VLD_CNT, VLD_OFFSET = 1000, 1000\n", + "ds_key, ds_subkey = \"mc4\", \"en\"\n", + "ds_name = f\"{ds_key}-{ds_subkey}\"\n", + "trn_jsonl_p = DATA_DIR / f\"{ds_name}_trn_{TRN_OFFSET}-{TRN_OFFSET+TRN_CNT}.jsonl\"\n", + "vld_jsonl_p = DATA_DIR / f\"{ds_name}_vld_{VLD_OFFSET}-{VLD_OFFSET+VLD_CNT}.jsonl\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NLyFy7Tr0gJz" + }, + "source": [ + "## URL Metadata of mC4 `(trn[:10000], vld[1000:2000])`\n", + "Skip this subsection if jsonl files of the same data subsets are prepared before." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "oFgAWS0dD2mq" + }, + "source": [ + "!pip check\n", + "!pip install -q datasets[streaming]\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "apuZh6PrVQhW" + }, + "source": [ + "from datasets import load_dataset\n", + "\n", + "\n", + "mc4_en_trn_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"train\",\n", + " # data_dir=DATA_DIR,\n", + " streaming=True,\n", + ")\n", + "mc4_en_vld_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"validation\",\n", + " # data_dir=DATA_DIR,\n", + " streaming=True,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZstWjsMWygfG" + }, + "source": [ + "!pip check\n", + "!pip install -q jsonlines\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "b4CozMGrjEOu" + }, + "source": [ + "import jsonlines\n", + "\n", + "\n", + "def _gen(ds, limit=None, offset=0):\n", + " for i, data in enumerate(ds):\n", + " if i < offset: continue\n", + " if i - offset == limit: break\n", + " yield {\n", + " \"text\": data['text'],\n", + " \"metadata\": [{\n", + " \"key\": \"url\",\n", + " \"type\": \"global\",\n", + " \"value\": data[\"url\"],\n", + " }],\n", + " 
}\n", + "\n", + "\n", + "with jsonlines.open(trn_jsonl_p, mode='w') as writer:\n", + " writer.write_all(l for l in _gen(mc4_en_trn_ds, TRN_CNT, TRN_OFFSET))\n", + "with jsonlines.open(vld_jsonl_p, mode='w') as writer:\n", + " writer.write_all(l for l in _gen(mc4_en_vld_ds, VLD_CNT, VLD_OFFSET))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbhiOxCY8rZ2" + }, + "source": [ + "# Trials" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "veqze27o9HEX" + }, + "source": [ + "## Dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "qqXXP-0sAzlv" + }, + "source": [ + "!pip check\n", + "!pip install -qU pip\n", + "!pip freeze | grep torch\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1Fv8-Xr2B21z" + }, + "source": [ + "!pip check\n", + "!pip install -q torch==1.8.1 torchvision==0.9.1 torchtext==0.9.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html\n", + "!pip freeze | grep torch\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpEg7z7CAiUc" + }, + "source": [ + "### TPU\n", + "Skip this step if not using TPU." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X_nCrsMuxRYa" + }, + "source": [ + "!pip check\n", + "# From https://github.com/huggingface/notebooks/blob/master/examples/accelerate/simple_nlp_example.ipynb\n", + "# Not sure whether we still need not-yet-released accelerate for TPU or not.\n", + "# !pip install -q cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl\n", + "!pip install -q cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8.1-cp37-cp37m-linux_x86_64.whl\n", + "# !pip install -q git+https://github.com/huggingface/accelerate\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fCISNe_6XnZ8" + }, + "source": [ + "### Package" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ptIFglMEAHnG" + }, + "source": [ + "%cd /content\n", + "!rm -rf bigscience-metadata\n", + "!git clone -b perf-collator_with_padding_for_tpu https://github.com/tianjianjiang/bigscience-metadata.git" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "RhnFQ-EXr-JP" + }, + "source": [ + "%cd /content/bigscience-metadata\n", + "!pip install -q -e .\n", + "!pip check" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "4BXNw3yrUX9A" + }, + "source": [ + "!wandb login" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L6_jjYRf0ZXN" + }, + "source": [ + "## Without URL" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_IdlsR4X0Vfq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3fb8ab2b-8df3-401e-c9c2-a023c2fd71bd" + }, + "source": [ + "# !TOKENIZERS_PARALLELISM=false \\\n", + "!accelerate launch --fp16 \\\n", + "/content/bigscience-metadata/bsmetadata/train.py \\\n", + "max_train_steps=100 
num_eval=1 \\\n", + "data_config.experiment=without_metadata \\\n", + "data_config.per_device_eval_batch_size=8 \\\n", + "data_config.max_seq_len=768 \\\n", + "data_config.train_file={trn_jsonl_p} \\\n", + "data_config.validation_file={vld_jsonl_p}" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "data_config:\n", + " experiment: without_metadata\n", + " per_device_eval_batch_size: 8\n", + " per_device_train_batch_size: 2\n", + " metadata_list: []\n", + " metadata_sep: ' | '\n", + " metadata_key_value_sep: ': '\n", + " metadata_probability: 1.0\n", + " global_metadata_sep: ' |||'\n", + " max_seq_len: 768\n", + " dataset_name: null\n", + " dataset_config_name: null\n", + " train_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl\n", + " validation_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl\n", + " overwrite_cache: false\n", + " cache_dir: null\n", + " preprocessing_num_workers: null\n", + " validation_split_percentage: 5\n", + " block_size: null\n", + " distributed_type: TPU\n", + "weight_decay: 0.0\n", + "learning_rate: 5.0e-05\n", + "gradient_accumulation_steps: 1\n", + "num_train_epochs: 1\n", + "max_train_steps: 100\n", + "lr_scheduler_type: linear\n", + "num_warmup_steps: 1000\n", + "seed: 42\n", + "out_dir: output_dir\n", + "num_eval: 1\n", + "model_name: gpt2\n", + "project_name: metadata_lm\n", + "\n", + "[2021-09-08 22:35:02,024][datasets.builder][WARNING] - Using custom data configuration default-e08b76da1fb65150\n", + "[2021-09-08 22:35:02,028][datasets.builder][WARNING] - Reusing dataset json (/root/.cache/huggingface/datasets/json/default-e08b76da1fb65150/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264)\n", + "Running tokenizer on dataset: 100% 10/10 
[00:20<00:00, 2.02s/ba]\n", + "Running tokenizer on dataset: 100% 1/1 [00:02<00:00, 2.13s/ba]\n", + "Grouping texts in chunks of 1024: 100% 10/10 [00:36<00:00, 3.60s/ba]\n", + "Grouping texts in chunks of 1024: 100% 1/1 [00:03<00:00, 3.92s/ba]\n", + "training: 0% 0/100 [00:00\n", + " sys.exit(main())\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/accelerate_cli.py\", line 41, in main\n", + " args.func(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 384, in launch_command\n", + " simple_launcher(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 142, in simple_launcher\n", + " raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)\n", + "subprocess.CalledProcessError: Command '['/usr/bin/python3', '/content/bigscience-metadata/bsmetadata/train.py', 'max_train_steps=100', 'num_eval=1', 'data_config.experiment=without_metadata', 'data_config.per_device_eval_batch_size=8', 'data_config.max_seq_len=768', 'data_config.train_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl', 'data_config.validation_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl']' died with .\n", + "/usr/lib/python3.7/multiprocessing/semaphore_tracker.py:144: UserWarning: semaphore_tracker: There appear to be 6 leaked semaphores to clean up at shutdown\n", + " len(cache))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ty69LlcHt7ZB" + }, + "source": [ + "# !rm -rf outputs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UI_pyX7P0cRG" + }, + "source": [ + "## With URL" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "dgPpzdhbnkMd", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "aa02ca4a-5d1e-414a-9564-4cf6f34d68ee" + }, + "source": [ + "# !TOKENIZERS_PARALLELISM=false \\\n", + 
"!accelerate launch --fp16 \\\n", + "/content/bigscience-metadata/bsmetadata/train.py \\\n", + "max_train_steps=100 num_eval=1 \\\n", + "data_config.experiment=with_metadata \\\n", + "data_config.per_device_eval_batch_size=8 \\\n", + "data_config.max_seq_len=768 \\\n", + "data_config.train_file={trn_jsonl_p} \\\n", + "data_config.validation_file={vld_jsonl_p}" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "WARNING:root:TPU has started up successfully with version pytorch-1.8.1\n", + "data_config:\n", + " experiment: with_metadata\n", + " per_device_eval_batch_size: 8\n", + " per_device_train_batch_size: 2\n", + " metadata_list: []\n", + " metadata_sep: ' | '\n", + " metadata_key_value_sep: ': '\n", + " metadata_probability: 1.0\n", + " global_metadata_sep: ' |||'\n", + " max_seq_len: 768\n", + " dataset_name: null\n", + " dataset_config_name: null\n", + " train_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl\n", + " validation_file: /content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl\n", + " overwrite_cache: false\n", + " cache_dir: null\n", + " preprocessing_num_workers: null\n", + " validation_split_percentage: 5\n", + " block_size: null\n", + " distributed_type: TPU\n", + "weight_decay: 0.0\n", + "learning_rate: 5.0e-05\n", + "gradient_accumulation_steps: 1\n", + "num_train_epochs: 1\n", + "max_train_steps: 100\n", + "lr_scheduler_type: linear\n", + "num_warmup_steps: 1000\n", + "seed: 42\n", + "out_dir: output_dir\n", + "num_eval: 1\n", + "model_name: gpt2\n", + "project_name: metadata_lm\n", + "\n", + "[2021-09-08 22:39:13,291][datasets.builder][WARNING] - Using custom data configuration default-e08b76da1fb65150\n", + "[2021-09-08 22:39:13,295][datasets.builder][WARNING] - Reusing dataset json 
(/root/.cache/huggingface/datasets/json/default-e08b76da1fb65150/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264)\n", + "Pre-process the text and metadata to create new samples: 100% 10/10 [00:58<00:00, 5.88s/ba]\n", + "Pre-process the text and metadata to create new samples: 100% 1/1 [00:06<00:00, 6.10s/ba]\n", + "Create labels column: 100% 13/13 [00:12<00:00, 1.06ba/s]\n", + "Create labels column: 100% 2/2 [00:01<00:00, 1.60ba/s]\n", + "training: 0% 0/100 [00:00\n", + " sys.exit(main())\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/accelerate_cli.py\", line 41, in main\n", + " args.func(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 384, in launch_command\n", + " simple_launcher(args)\n", + " File \"/usr/local/lib/python3.7/dist-packages/accelerate/commands/launch.py\", line 142, in simple_launcher\n", + " raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)\n", + "subprocess.CalledProcessError: Command '['/usr/bin/python3', '/content/bigscience-metadata/bsmetadata/train.py', 'max_train_steps=100', 'num_eval=1', 'data_config.experiment=with_metadata', 'data_config.per_device_eval_batch_size=8', 'data_config.max_seq_len=768', 'data_config.train_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_trn_0-10000.jsonl', 'data_config.validation_file=/content/drive/MyDrive/colab_data/bigscience/mc4-en_vld_1000-2000.jsonl']' died with .\n", + "/usr/lib/python3.7/multiprocessing/semaphore_tracker.py:144: UserWarning: semaphore_tracker: There appear to be 6 leaked semaphores to clean up at shutdown\n", + " len(cache))\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UPTWrf_Tt74I" + }, + "source": [ + "# !rm -rf outputs" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ckWrWh2r07ZH" + }, + "source": [ + "## TBA" + ] + }, + { + "cell_type": "code", + "metadata": { + 
"id": "xTkK8Ye2B16g" + }, + "source": [ + "!pip install git+https://git@github.com/bigscience-workshop/promptsource.git@main" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ozeXjIj79zWv" + }, + "source": [ + "# from promptsource.utils import get_dataset\n", + "# tydiqa_pri_ds, failed = get_dataset(ds_key, ds_subkey)\n", + "# tydiqa_pri_ds, failed\n", + "\n", + "ds_key, ds_subkey = \"tydiqa\", \"primary_task\"\n", + "tydiqa_pri_trn_ds = load_dataset(\n", + " ds_key,\n", + " ds_subkey,\n", + " split=\"train\",\n", + " data_dir=DATA_DIR,\n", + " streaming=True\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8JYUCIzJQDsw" + }, + "source": [ + "from promptsource.templates import TemplateCollection\n", + "template_collection = TemplateCollection()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Kgrmu0xGQUMp" + }, + "source": [ + "tydiqa_pri_tmpls = template_collection.get_dataset(ds_key, ds_subkey)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "o3cBp_wCQ11h" + }, + "source": [ + "tydiqa_pri_tmpls.all_template_names" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LIQjZWnvSzW5" + }, + "source": [ + "from promptsource.utils import removeHyphen" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nPQdsLxMQ89g" + }, + "source": [ + "done = False\n", + "for data in tydiqa_pri_trn_ds:\n", + " if data[\"language\"] != \"english\":\n", + " continue\n", + " for tmpl in tydiqa_pri_tmpls.templates.values():\n", + " inst = tmpl.apply(removeHyphen(data))\n", + " if len(inst) == 2:\n", + " data.pop(\"passage_answer_candidates\", None)\n", + " # data.pop(\"plaintext_start_byte\", None)\n", + " display(data)\n", + " prmp, ans = inst\n", + " 
from urllib.parse import unquote_plus, urlsplit

import pandas as pd


# Map every distinct training URL to its split components.
trn_url_parts_map = {}
with jsonlines.open(trn_jsonl_p) as trn_jsonl:
    for trn_json in trn_jsonl:
        # Take the value of the first metadata entry whose key is "url".
        trn_url = next(
            entry for entry in trn_json["metadata"] if entry["key"] == "url"
        )["value"]
        trn_url_parts = urlsplit(trn_url)
        trn_url_parts_map[trn_url] = trn_url_parts


def _trn_url_parts_row(url, parts):
    """Build one row of URL features from a URL and its ``urlsplit`` result."""
    netloc_labels = [label for label in parts.netloc.split('.') if label]
    path_segments = [segment for segment in parts.path.split('/') if segment]
    return (
        url,
        parts.netloc,
        len(netloc_labels),
        parts.hostname or '',
        parts.port or '',
        unquote_plus(parts.path),
        len(path_segments),
        unquote_plus(parts.query),
        unquote_plus(parts.fragment),
    )


trn_url_parts_df = pd.DataFrame(
    data=[_trn_url_parts_row(*item) for item in trn_url_parts_map.items()],
    columns=[
        'url',
        'netloc',
        'netloc_level_cnt',
        'hostname',
        'port',
        'path',
        'path_level_cnt',
        'query',
        'fragment',
    ],
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlnetlocnetloc_level_cnthostnameportpathpath_level_cntqueryfragment
0http://www.polkaudio.com/forums/showthread.php...www.polkaudio.com3www.polkaudio.com/forums/showthread.php258429-Are-my-speakers-magnetically-shielded&go...
1http://www.atthecoachhouse.co.uk/events/practi...www.atthecoachhouse.co.uk4www.atthecoachhouse.co.uk/events/practical-course-in-the-natural-crafts...2
2https://www.digitalspy.com/showbiz/a26775099/l...www.digitalspy.com3www.digitalspy.com/showbiz/a26775099/louis-tomlinson-zayn-malik-...3
3http://www.californialandcan.org/Plumas/Farm-R...www.californialandcan.org3www.californialandcan.org/Plumas/Farm-Resources/2
4https://www.seattlepi.com/local/article/Report...www.seattlepi.com3www.seattlepi.com/local/article/Report-cites-major-problems-at-...3
\n", + "
" + ], + "text/plain": [ + " url ... fragment\n", + "0 http://www.polkaudio.com/forums/showthread.php... ... \n", + "1 http://www.atthecoachhouse.co.uk/events/practi... ... \n", + "2 https://www.digitalspy.com/showbiz/a26775099/l... ... \n", + "3 http://www.californialandcan.org/Plumas/Farm-R... ... \n", + "4 https://www.seattlepi.com/local/article/Report... ... \n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359 + }, + "id": "KK7CcZXXT4sR", + "outputId": "0acfd2b0-818d-47ca-df2d-70de695bf132" + }, + "source": [ + "trn_url_parts_df.describe(percentiles=[.25, .5, .75, .95, .975])" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
netloc_level_cntpath_level_cnt
count10000.00000010000.000000
mean2.8457002.495700
std0.5833741.398491
min2.0000000.000000
25%2.0000001.000000
50%3.0000002.000000
75%3.0000003.000000
95%4.0000005.000000
97.5%4.0000006.000000
max7.00000014.000000
\n", + "
" + ], + "text/plain": [ + " netloc_level_cnt path_level_cnt\n", + "count 10000.000000 10000.000000\n", + "mean 2.845700 2.495700\n", + "std 0.583374 1.398491\n", + "min 2.000000 0.000000\n", + "25% 2.000000 1.000000\n", + "50% 3.000000 2.000000\n", + "75% 3.000000 3.000000\n", + "95% 4.000000 5.000000\n", + "97.5% 4.000000 6.000000\n", + "max 7.000000 14.000000" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 282 + }, + "id": "-4ZLCRmOT4sR", + "outputId": "373f2605-6e13-48d3-db1d-be904ba1f533" + }, + "source": [ + "trn_url_parts_df.netloc_level_cnt.value_counts().sort_index().plot()\n", + "trn_url_parts_df.path_level_cnt.value_counts().sort_index().plot()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": null + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ghRgQvOnVewZ", + "outputId": "b9c72b93-0728-4a11-8ecc-b38a30062f27" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.netloc_level_cnt==7].netloc.tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['bmcinfectdis.biomedcentral.com.preview-live.oscarjournals.springer.com',\n", + " '0-www.loc.gov.oasys.lib.oxy.edu',\n", + " 'www.bennettandbelfort.com.php73-36.phx1-1.websitetestlink.com']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2cmQUCorT4sS", + "outputId": "5813272b-0007-43b2-c817-c57302a8ab76" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.path_level_cnt==14].path.tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/MetroKids/Attractions-and-Events/Attractions-Guide/index.php/alpha/P/category/Theaters/ages/Parents/city/Malvern/county/NJ+-+Burlington+County/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sR1h230cWTdo", + "outputId": "ce8d4cb8-2730-4a34-f4d0-2b5cd84a944b" + }, + "source": [ + "trn_url_parts_df = trn_url_parts_df.assign(\n", + " yearlike_path=trn_url_parts_df.path.str.contains(r\"20[0-2][0-9]|19\\d\\d\")\n", + ")\n", + "print(\n", + " f\"{len(trn_url_parts_df[trn_url_parts_df.yearlike_path].index)/len(trn_url_parts_df.index):.2%}\"\n", + ")\n", + "trn_url_parts_df.yearlike_path.value_counts().to_dict()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "22.52%\n" + ] + }, + { + 
"output_type": "execute_result", + "data": { + "text/plain": [ + "{False: 7748, True: 2252}" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Tf3hlUV1WTdt", + "outputId": "544100f9-0122-42b5-e309-e0a5f8daf00f" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.yearlike_path].path[:10].tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/news/2005/03/22/on-time-and-on-budget/',\n", + " '/2013/01/19/kitchen-tip-7-how-to-cook-pasta/',\n", + " '/2014/08/18/us-pay-tv-providers-shed-300000-subs-in-q2/',\n", + " '/2011/11/24/reflections-thanksgiving-2011/',\n", + " '/2019/',\n", + " '/oem-parts/2005-suzuki-dr200se-gasket-set/o/m16072sch590528',\n", + " '/Products/Genuine-Joe-Linen-like-Table-Skirts__GJO11915.aspx',\n", + " '/listing/485271256/vintage-90s-1995-58th-annual-postal',\n", + " '/2013/01/08/calendar-cat/',\n", + " '/2012/06/new-model-army-equipment-stolen-can-you-help/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OG8YdBMqWTdu", + "outputId": "8e1a083e-df6f-4519-e47f-a08f250b4177" + }, + "source": [ + "import warnings\n", + "\n", + "\n", + "warnings.simplefilter(action=\"ignore\") # Just don't want to see `UserWarning` about regex group\n", + "trn_url_parts_df = trn_url_parts_df.assign(\n", + " datelike_path=trn_url_parts_df.path.str.contains(\n", + " r\"(20[0-2][0-9]|19\\d\\d)[/\\-](0?[1-9]|11|12)[/\\-](0?[1-9]|[12]\\d|3[01])\"\n", + " )\n", + ")\n", + "print(\n", + " f\"{len(trn_url_parts_df[trn_url_parts_df.datelike_path].index)/len(trn_url_parts_df.index):.2%}\"\n", + ")\n", + "trn_url_parts_df.datelike_path.value_counts().to_dict()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": 
"stdout", + "text": [ + "7.34%\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{False: 9266, True: 734}" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vN1sK3uzWTdu", + "outputId": "1cf1dbef-5e98-4112-e20c-05f1a13b3f80" + }, + "source": [ + "trn_url_parts_df[trn_url_parts_df.datelike_path].path[:10].tolist()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/news/2005/03/22/on-time-and-on-budget/',\n", + " '/2013/01/19/kitchen-tip-7-how-to-cook-pasta/',\n", + " '/2014/08/18/us-pay-tv-providers-shed-300000-subs-in-q2/',\n", + " '/2011/11/24/reflections-thanksgiving-2011/',\n", + " '/2013/01/08/calendar-cat/',\n", + " '/2016/02/04/the-jesus-revolution/',\n", + " '/2016/03/18/colorado-avalanche-vs-calgary-flames-nhl-betting-hockey-odds-pick-and-prediction/',\n", + " '/news/2013/08/06/communities-should-be-paid-for-wind-and-solar-in-their-neighbourhood/',\n", + " '/2017/11/20/maurizio-cannavacciuolo/',\n", + " '/index.php/2014/02/19/showing-a-little-love/']" + ] + }, + "metadata": {}, + "execution_count": null + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wexEuNAuOTrI" + }, + "source": [ + "## Deprecated Basic EDA of C4 samples.tgz\n", + "Old code for raw data of C4 samples.tgz (uncompressed as `dst/en_meta.json`, for example)." 
def _en_meta_record(obj):
    """Turn one decoded JSON object into a ``Meta`` namedtuple.

    Hyphens in the object's keys are replaced with underscores to form the
    field names; values are passed positionally in the object's own order.
    """
    field_names = [key.replace('-', '_') for key in obj.keys()]
    meta_cls = namedtuple('Meta', field_names)
    return meta_cls(*obj.values())


# Every JSON object in the file (nested ones included) goes through the hook.
with open('en_meta.json') as en_meta_f:
    en_meta_json = json.load(en_meta_f, object_hook=_en_meta_record)
def _en_uri_parts_row(uri, parts):
    """Build one row of path/query/fragment features for a split URI."""
    path_segments = [segment for segment in parts.path.split('/') if segment]
    return (
        uri,
        unquote_plus(parts.path),
        len(path_segments),
        unquote_plus(parts.query),
        unquote_plus(parts.fragment),
    )


# One row per distinct URI in the English metadata sample.
en_uri_parts_df = pd.DataFrame(
    data=[_en_uri_parts_row(*item) for item in en_uri_parts_map.items()],
    columns=['uri', 'path', 'path_level_cnt', 'query', 'fragment'],
)
[ + "# Inspect the paths that were flagged as containing a year-like token.\n", + "en_uri_parts_df[en_uri_parts_df.yearlike_path].path.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Zck_PuJ0PoCx" + }, + "source": [ + "# Flag paths containing a YYYY/MM/DD or YYYY-MM-DD style date.\n", + "# The flag is stored as `datelike_path`, the column the following lines and cells read.\n", + "# Month is 0?[1-9]|1[0-2] so that October (10) is matched; groups are non-capturing\n", + "# because str.contains only needs a boolean and pandas warns about capture groups.\n", + "en_uri_parts_df = en_uri_parts_df.assign(\n", + "    datelike_path=en_uri_parts_df.path.str.contains(\n", + "        r\"(?:20[0-2][0-9]|19\\d\\d)[/\\-](?:0?[1-9]|1[0-2])[/\\-](?:0?[1-9]|[12]\\d|3[01])\"\n", + "    )\n", + ")\n", + "# Share of rows whose path looks date-like, then the raw True/False counts.\n", + "display(f\"{len(en_uri_parts_df[en_uri_parts_df.datelike_path].index)/len(en_uri_parts_df.index):.2%}\")\n", + "en_uri_parts_df.datelike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BrJFJtjEPuMx" + }, + "source": [ + "# Inspect the paths that were flagged as date-like.\n", + "en_uri_parts_df[en_uri_parts_df.datelike_path].path.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "__lnPt_ya_SS" + }, + "source": [ + "### All" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "SnoCgIxkTVWs" + }, + "source": [ + "# Load every file matching ??_meta.json (two arbitrary characters before _meta.json).\n", + "# object_hook turns each JSON object into a Meta namedtuple, replacing '-' in keys with '_'\n", + "# so that fields can be read as attributes.\n", + "# Files are visited in sorted order: glob gives no ordering guarantee and the\n", + "# offset-keyed dict built in the next cell is last-write-wins.\n", + "meta_jsons = []\n", + "for meta_json_fname in sorted(glob('??_meta.json')):\n", + "    with open(meta_json_fname, encoding='utf-8') as meta_json_f:\n", + "        meta_jsons.extend(\n", + "            json.load(\n", + "                meta_json_f,\n", + "                object_hook=lambda d: namedtuple(\n", + "                    'Meta',\n", + "                    map(lambda k: k.replace('-', '_'), d.keys())\n", + "                )(*d.values())\n", + "            )\n", + "        )" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2dz9enqbT6iK" + }, + "source": [ + "# Map each record's offset to the warc_target_uri found in its headers\n", + "# (a repeated offset keeps only the last record seen).\n", + "meta_offset_uri_map = {\n", + "    row.offset: row.headers.warc_target_uri for row in meta_jsons\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1cqTjbtjV1oR" + }, + "source": [ + "# Split each URI into its components with urlsplit, keyed by the URI itself.\n", + "uri_parts_map = {\n", + "    uri: urlsplit(uri) for uri in meta_offset_uri_map.values()\n", + "}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qgLPRtDnVVEk" + }, + "source": [ +
"# One row per URI: unquoted path, count of non-empty '/'-separated path segments,\n", + "# unquoted query string and unquoted fragment.\n", + "uri_parts_df = pd.DataFrame(\n", + "    data=[(\n", + "        uri,\n", + "        unquote_plus(parts.path),\n", + "        sum(1 for _ in filter(None, parts.path.split('/'))),\n", + "        unquote_plus(parts.query),\n", + "        unquote_plus(parts.fragment)\n", + "    ) for uri, parts in uri_parts_map.items()],\n", + "    columns=['uri', 'path', 'path_level_cnt', 'query', 'fragment']\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0nXMJuiMbcBx" + }, + "source": [ + "#### Number of Path Levels" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3y7CVd1iV_yN" + }, + "source": [ + "# Summary statistics, including the 95th and 97.5th percentiles.\n", + "uri_parts_df.describe(percentiles=[.25, .5, .75, .95, .975])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "88eQb3RFWGnk" + }, + "source": [ + "# Number of URIs per path-level count, ordered by level count.\n", + "uri_parts_df.path_level_cnt.value_counts().sort_index().plot()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "isBR1jmUWMb0" + }, + "source": [ + "# Inspect the URIs whose path has exactly 61 levels.\n", + "# NOTE(review): 61 is hard-coded, presumably the outlier seen in the plot above;\n", + "# re-check the value after re-running on different data.\n", + "uri_parts_df[uri_parts_df.path_level_cnt==61].uri.values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j3EMRvoAbf9Y" + }, + "source": [ + "#### Year and Date" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fLdePo9-Wqx8" + }, + "source": [ + "# Flag paths containing a four-digit run that looks like a year (19xx or 2000-2029).\n", + "# The mask is computed from uri_parts_df itself, the frame built earlier in this section.\n", + "uri_parts_df = uri_parts_df.assign(\n", + "    yearlike_path=uri_parts_df.path.str.contains(r\"20[0-2][0-9]|19\\d\\d\")\n", + ")\n", + "# Share of rows whose path looks year-like, then the raw True/False counts.\n", + "display(f\"{len(uri_parts_df[uri_parts_df.yearlike_path].index)/len(uri_parts_df.index):.2%}\")\n", + "uri_parts_df.yearlike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iffYbZ_BXNjE" + }, + "source": [ + "# Flag paths containing a YYYY/MM/DD or YYYY-MM-DD style date.\n", + "# Month is 0?[1-9]|1[0-2] so that October (10) is matched; groups are non-capturing\n", + "# because str.contains only needs a boolean and pandas warns about capture groups.\n", + "uri_parts_df = uri_parts_df.assign(\n", + "    datelike_path=uri_parts_df.path.str.contains(\n", + "        r\"(?:20[0-2][0-9]|19\\d\\d)[/\\-](?:0?[1-9]|1[0-2])[/\\-](?:0?[1-9]|[12]\\d|3[01])\"\n", + "    )\n", + ")\n", +
"# Share of rows whose path looks date-like, then the raw True/False counts.\n", + "display(f\"{len(uri_parts_df[uri_parts_df.datelike_path].index)/len(uri_parts_df.index):.2%}\")\n", + "uri_parts_df.datelike_path.value_counts()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rWJKdnZFZSvH" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file