From 9cc99be72e5fcc0426ecf7596042f11c6c6f45fe Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 20:33:33 -0400 Subject: [PATCH 01/17] Added NanoChat evaluation modules under benchmarks/language_models. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrated NanoChat’s benchmark components into Plato to enable direct evaluation of Plato models using the NanoChat benchmark. Added files: - common.py: shared utilities and configurations for the benchmark - core_eval.py: implements the CORE benchmark evaluation logic - evaluate_model.py: main entry point to run model evaluation from Plato - report.py: handles result aggregation and reporting - tokenizer.py: provides tokenization utilities for language model evaluation --- benchmarks/language_models/common.py | 154 +++++++ benchmarks/language_models/core_eval.py | 269 +++++++++++ benchmarks/language_models/evaluate_model.py | 224 ++++++++++ benchmarks/language_models/report.py | 446 +++++++++++++++++++ benchmarks/language_models/tokenizer.py | 234 ++++++++++ 5 files changed, 1327 insertions(+) create mode 100644 benchmarks/language_models/common.py create mode 100644 benchmarks/language_models/core_eval.py create mode 100644 benchmarks/language_models/evaluate_model.py create mode 100644 benchmarks/language_models/report.py create mode 100644 benchmarks/language_models/tokenizer.py diff --git a/benchmarks/language_models/common.py b/benchmarks/language_models/common.py new file mode 100644 index 000000000..c13dfd424 --- /dev/null +++ b/benchmarks/language_models/common.py @@ -0,0 +1,154 @@ +""" +Common utilities for nanochat. +""" + +import os +import re +import logging +import torch +import torch.distributed as dist + + +class ColoredFormatter(logging.Formatter): + """Custom formatter that adds colors to log messages.""" + + # ANSI color codes + COLORS = { + "DEBUG": "\033[36m", # Cyan + "INFO": "\033[32m", # Green + "WARNING": "\033[33m", # Yellow + "ERROR": "\033[31m", # Red + "CRITICAL": "\033[35m", # Magenta + } + RESET = "\033[0m" + BOLD = "\033[1m" + + def format(self, record): + # Add color to the level name + levelname = record.levelname + if levelname in self.COLORS: + record.levelname = ( + f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}" + ) + # Format the message + message = super().format(record) + # Add color to specific parts of the message + if levelname == "INFO": + # Highlight numbers and percentages + message = re.sub( + r"(\d+\.?\d*\s*(?:GB|MB|%|docs))", + rf"{self.BOLD}\1{self.RESET}", + message, + ) + message = re.sub( + r"(Shard \d+)", + rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}", + message, + ) + return message + + +def setup_default_logging(): + handler = logging.StreamHandler() + handler.setFormatter( + ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + ) + logging.basicConfig(level=logging.INFO, handlers=[handler]) + + +setup_default_logging() +logger = logging.getLogger(__name__) + + +def get_base_dir(): + # co-locate nanochat intermediates with other cached data in ~/.cache (by default) + if os.environ.get("NANOCHAT_BASE_DIR"): + nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR") + else: + home_dir = os.path.expanduser("~") + cache_dir = os.path.join(home_dir, ".cache") + nanochat_dir = os.path.join(cache_dir, "nanochat") + os.makedirs(nanochat_dir, exist_ok=True) + return nanochat_dir + + +def print0(s="", **kwargs): + ddp_rank = int(os.environ.get("RANK", 0)) + if ddp_rank == 0: + print(s, **kwargs) + 
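+# A minimal usage sketch of the helpers defined below, mirroring how evaluate_model.py
+# drives them (works under a single process or torchrun; the evaluation itself is elided):
+#
+#   device_type = autodetect_device_type()
+#   ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
+#   ...  # run the evaluation on `device`
+#   compute_cleanup()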
+ +def is_ddp(): + # TODO is there a proper way + return int(os.environ.get("RANK", -1)) != -1 + + +def get_dist_info(): + if is_ddp(): + assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]) + ddp_rank = int(os.environ["RANK"]) + ddp_local_rank = int(os.environ["LOCAL_RANK"]) + ddp_world_size = int(os.environ["WORLD_SIZE"]) + return True, ddp_rank, ddp_local_rank, ddp_world_size + else: + return False, 0, 0, 1 + + +def autodetect_device_type(): + # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU + if torch.cuda.is_available(): + device_type = "cuda" + elif torch.backends.mps.is_available(): + device_type = "mps" + else: + device_type = "cpu" + print0(f"Autodetected device type: {device_type}") + return device_type + + +def compute_init(device_type="cuda"): # cuda|cpu|mps + """Basic initialization that we keep doing over and over, so make common.""" + + assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm" + if device_type == "cuda": + assert torch.cuda.is_available(), ( + "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'" + ) + if device_type == "mps": + assert torch.backends.mps.is_available(), ( + "Your PyTorch installation is not configured for MPS but device_type is 'mps'" + ) + + # Reproducibility + torch.manual_seed(42) + if device_type == "cuda": + torch.cuda.manual_seed(42) + # skipping full reproducibility for now, possibly investigate slowdown later + # torch.use_deterministic_algorithms(True) + + # Precision + if device_type == "cuda": + torch.set_float32_matmul_precision( + "high" + ) # uses tf32 instead of fp32 for matmuls + + # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA + ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + if ddp and device_type == "cuda": + device = torch.device("cuda", ddp_local_rank) + torch.cuda.set_device(device) # make "cuda" default to this device + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + else: + device = torch.device(device_type) # mps|cpu + + if ddp_rank == 0: + logger.info(f"Distributed world size: {ddp_world_size}") + + return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device + + +def compute_cleanup(): + """Companion function to compute_init, to clean things up before script exit""" + if is_ddp(): + dist.destroy_process_group() diff --git a/benchmarks/language_models/core_eval.py b/benchmarks/language_models/core_eval.py new file mode 100644 index 000000000..547d22cd3 --- /dev/null +++ b/benchmarks/language_models/core_eval.py @@ -0,0 +1,269 @@ +""" +Functions for evaluating the CORE metric, as described in the DCLM paper. +https://arxiv.org/abs/2406.11794 + +TODOs: +- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
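+
+For reference, how the score is assembled (evaluate_model.py is the driver):
+- evaluate_task() below returns the raw accuracy for one task,
+- each accuracy is centered against that task's random-guessing baseline,
+      centered = (accuracy - baseline) / (1 - baseline),
+- and the CORE metric is the mean of the centered accuracies across all tasks.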
+""" + +import random + +from jinja2 import Template +import torch +import torch.distributed as dist + +# ----------------------------------------------------------------------------- +# Prompt rendering utilities + + +def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a multiple choice question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} + +{% endfor -%} +{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [template.render(choice=choice, **context) for choice in item["choices"]] + return prompts + + +def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a schema question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [ + template.render(context=context_option, **context) + for context_option in item["context_options"] + ] + return prompts + + +def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): + """ + Render complete prompt for a language modeling task. + Notice that we manually trim the context in the template, + which in some datasets seems to have trailing whitespace (which we don't want). + """ + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + # Return two prompts: without and with the continuation + prompt_without = template.render(include_continuation=False, **context) + prompt_with = template.render(include_continuation=True, **context) + # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. + # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next + # token in prompt_with), meaning we don't get a nice and clean prefix in the token space + # to detect the final continuation. Tokenizers... 
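+    # Illustrative example (made-up strings, not from any dataset): with a trailing space,
+    # "Q: capital of France? A: " keeps the space as its own final token, whereas
+    # "Q: capital of France? A: Paris" folds it into " Paris", so the shorter prompt
+    # would no longer be an exact token-level prefix of the longer one.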
+ prompt_without = prompt_without.strip() + return [prompt_without, prompt_with] + + +def find_common_length(token_sequences, direction="left"): + """ + Find the length of the common prefix or suffix across token sequences + - direction: 'left' for prefix, 'right' for suffix + """ + min_len = min(len(seq) for seq in token_sequences) + indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] + # Find the first position where the token sequences differ + for i, idx in enumerate(indices): + token = token_sequences[0][idx] + if not all(seq[idx] == token for seq in token_sequences): + return i + return min_len + + +def stack_sequences(tokens, pad_token_id): + """Stack up a list of token sequences, pad to longest on the right""" + bsz, seq_len = len(tokens), max(len(x) for x in tokens) + input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) + for i, x in enumerate(tokens): + input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) + return input_ids + + +def batch_sequences_mc(tokenizer, prompts): + # In multiple choice, contexts are the same but the continuation is different (common prefix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each continuation + answer_start_idx = find_common_length(tokens, direction="left") + start_indices = [answer_start_idx] * len(prompts) + end_indices = [len(x) for x in tokens] + return tokens, start_indices, end_indices + + +def batch_sequences_schema(tokenizer, prompts): + # In schema tasks, contexts vary but continuation is the same (common suffix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each context + suffix_length = find_common_length(tokens, direction="right") + end_indices = [len(x) for x in tokens] + start_indices = [ei - suffix_length for ei in end_indices] + return tokens, start_indices, end_indices + + +def batch_sequences_lm(tokenizer, prompts): + # In LM tasks, we have two prompts: without and with continuation + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + tokens_without, tokens_with = tokens + start_idx, end_idx = len(tokens_without), len(tokens_with) + assert start_idx < end_idx, ( + "prompt without is supposed to be a prefix of prompt with" + ) + assert tokens_without == tokens_with[:start_idx], ( + "prompt without is supposed to be a prefix of prompt with" + ) + # we only need the with continuation prompt in the LM task, i.e. batch size of 1 + return [tokens_with], [start_idx], [end_idx] + + +@torch.no_grad() +def forward_model(model, input_ids): + """ + Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. + The last column of losses is set to nan because we don't have autoregressive targets there. 
+ """ + batch_size, seq_len = input_ids.size() + outputs = model(input_ids) + # Roll the tensor to the left by one position to get the (autoregressive) target ids + target_ids = torch.roll(input_ids, shifts=-1, dims=1) + # Calculate cross entropy at all positions + losses = torch.nn.functional.cross_entropy( + outputs.view(batch_size * seq_len, -1), + target_ids.view(batch_size * seq_len), + reduction="none", + ).view(batch_size, seq_len) + # Set the last column to be nan because there is no autoregressive loss there + losses[:, -1] = float("nan") + # Get the argmax predictions at each position + predictions = outputs.argmax(dim=-1) + return losses, predictions + + +@torch.no_grad() +def evaluate_example(idx, model, tokenizer, data, device, task_meta): + """Evaluate a single example, return True if correct, False otherwise""" + item = data[idx] + task_type = task_meta["task_type"] + num_fewshot = task_meta["num_fewshot"] + continuation_delimiter = task_meta["continuation_delimiter"] + + # Sample few-shot examples (excluding current item) + fewshot_examples = [] + if num_fewshot > 0: + rng = random.Random(1234 + idx) + available_indices = [i for i in range(len(data)) if i != idx] + fewshot_indices = rng.sample(available_indices, num_fewshot) + fewshot_examples = [data[i] for i in fewshot_indices] + + # Render prompts and batch sequences based on task type + if task_type == "multiple_choice": + prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) + elif task_type == "schema": + prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) + elif task_type == "language_modeling": + prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) + else: + raise ValueError(f"Unsupported task type: {task_type}") + + # Some models can't forward sequences beyond a certain length (e.g. GPT-2) + # In these cases, we have to truncate sequences to max length and adjust the indices + if hasattr(model, "max_seq_len") and model.max_seq_len is not None: + max_tokens = model.max_seq_len + new_tokens, new_start_idxs, new_end_idxs = [], [], [] + for t, s, e in zip(tokens, start_idxs, end_idxs): + if len(t) > max_tokens: + num_to_crop = len(t) - max_tokens + new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens + new_start_idxs.append(s - num_to_crop) # shift the indices down + new_end_idxs.append(e - num_to_crop) + assert s - num_to_crop >= 0, "this should never happen right?" + assert e - num_to_crop >= 0, "this should never happen right?" 
+ else: + new_tokens.append(t) # keep unchanged + new_start_idxs.append(s) + new_end_idxs.append(e) + tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs + + # Stack up all the sequences into a batch + pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok + input_ids = stack_sequences(tokens, pad_token_id) + input_ids = input_ids.to(device) + + # Forward the model, get the autoregressive loss and argmax prediction at each token + losses, predictions = forward_model(model, input_ids) + + # See if the losses/predictions come out correctly + if task_type == "language_modeling": + # language modeling task is currently always batch size 1 + si = start_idxs[0] + ei = end_idxs[0] + # predictions[i] predict input_ids[i+1] autoregressively + predicted_tokens = predictions[0, si - 1 : ei - 1] + actual_tokens = input_ids[0, si:ei] + is_correct = torch.all(predicted_tokens == actual_tokens).item() + elif task_type in ["multiple_choice", "schema"]: + # For MC/schema: find the option with lowest average loss + mean_losses = [ + losses[i, si - 1 : ei - 1].mean().item() + for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) + ] + pred_idx = mean_losses.index(min(mean_losses)) + is_correct = pred_idx == item["gold"] + else: + raise ValueError(f"Unsupported task type: {task_type}") + + return is_correct + + +def evaluate_task(model, tokenizer, data, device, task_meta): + """ + This function is responsible for evaluating one task across many examples. + It also handles dispatch to all processes if the script is run with torchrun. + """ + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + correct = torch.zeros(len(data), dtype=torch.float32, device=device) + # stride the examples to each rank + for idx in range(rank, len(data), world_size): + is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) + correct[idx] = float(is_correct) + # sync results across all the processes if running distributed + if world_size > 1: + dist.barrier() + dist.all_reduce(correct, op=dist.ReduceOp.SUM) + # compute the mean + mean_correct = correct.mean().item() + return mean_correct diff --git a/benchmarks/language_models/evaluate_model.py b/benchmarks/language_models/evaluate_model.py new file mode 100644 index 000000000..b133013c5 --- /dev/null +++ b/benchmarks/language_models/evaluate_model.py @@ -0,0 +1,224 @@ +""" +Evlauate the CORE metric for a given model. + +Run on a single GPU: +python base_eval.py --hf-path + +The script will print the CORE metric to the console. +""" + +import os +import time +import json +import random +import yaml +from contextlib import nullcontext + +import pandas as pd +import torch + +from common import ( + compute_init, + compute_cleanup, + print0, + get_base_dir, + autodetect_device_type, +) +from tokenizer import UniversalHuggingFaceTokenizer +from core_eval import evaluate_task + +# ----------------------------------------------------------------------------- +# nanoChat specific function dealing with I/O etc. + + +def evaluate_model(model, tokenizer, device, max_per_task=-1): + """ + Evaluate a base model on the CORE benchmark. + - max_per_task: crop the data to this many examples per task for testing (-1 = disable) + TODO: clean up this function, delete the need for all the files, for pandas dependency, etc. 
+ """ + # Load config and task metadata + base_dir = get_base_dir() + eval_bundle_dir = os.path.join(base_dir, "eval_bundle") + config_path = os.path.join(eval_bundle_dir, "core.yaml") + data_base_path = os.path.join(eval_bundle_dir, "eval_data") + eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + with open(config_path, "r") as f: + config = yaml.safe_load(f) + tasks = config["icl_tasks"] + eval_metadata = pd.read_csv(eval_meta_data) + + # Evaluate each task + results = {} + centered_results = {} + for task in tasks: + start_time = time.time() + label = task["label"] + task_meta = { + "task_type": task["icl_task_type"], + "dataset_uri": task["dataset_uri"], + "num_fewshot": task["num_fewshot"][0], + "continuation_delimiter": task.get("continuation_delimiter", " "), + } + print0( + f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", + end="", + ) + + # Load data for this task + data_path = os.path.join(data_base_path, task_meta["dataset_uri"]) + with open(data_path, "r") as f: + data = [json.loads(line.strip()) for line in f] + + # shuffle the data because in many cases it appears ordered but we want + # the abillity to only run a subset of the data for debugging purposes etc. + shuffle_rng = random.Random(1337) + shuffle_rng.shuffle(data) + if max_per_task > 0: + data = data[:max_per_task] + + # run the evaluation for this task + accuracy = evaluate_task(model, tokenizer, data, device, task_meta) + + results[label] = accuracy + row = eval_metadata[eval_metadata["Eval Task"] == label] + random_baseline = row["Random baseline"].values[0] + centered_result = (accuracy - 0.01 * random_baseline) / ( + 1.0 - 0.01 * random_baseline + ) + centered_results[label] = centered_result + end_time = time.time() + print0( + f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s" + ) + + core_metric = sum(centered_results.values()) / len(centered_results) + out = { + "results": results, + "centered_results": centered_results, + "core_metric": core_metric, + } + return out + + +# ----------------------------------------------------------------------------- +# HuggingFace loading utilities and light wrappers for a model + + +class ModelWrapper: + """Lightweight wrapper for a HuggingFace model""" + + def __init__(self, model, max_seq_len=None): + self.model = model + self.max_seq_len = max_seq_len + + def __call__(self, input_ids): + outputs = self.model(input_ids) + logits = outputs.logits + return logits + + +def load_hf_model(hf_path: str, device): + print0(f"Loading model from: {hf_path}") + from transformers import AutoModelForCausalLM, AutoConfig + + if os.path.exists(hf_path): + hf_path = os.path.abspath(hf_path) + print0(f"Using absolute path: {hf_path}") + + # Load config to help with token detection + config = AutoConfig.from_pretrained(hf_path, local_files_only=True) + model = AutoModelForCausalLM.from_pretrained(hf_path, local_files_only=True) + tokenizer = UniversalHuggingFaceTokenizer(hf_path, config) + + model.to(device) + model.eval() + max_seq_len = 1024 # subject to change based on model type, for GPT-2 it's 1024 + model = ModelWrapper(model, max_seq_len=max_seq_len) + return model, tokenizer + + +# ----------------------------------------------------------------------------- +def main(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--hf-path", + type=str, + default=None, + required=True, + help="HuggingFace model path to evaluate", + ) + 
parser.add_argument( + "--max-per-task", + type=int, + default=-1, + help="Max examples per task to evaluate (-1 = disable)", + ) + args = parser.parse_args() + + # distributed / precision setup + device_type = autodetect_device_type() + ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) + autocast_ctx = ( + torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) + if device_type == "cuda" + else nullcontext() + ) + + # Load model and tokenizer from command line or from file system + # atm assume that if a path is given, it's a huggingface model path + hf_path = args.hf_path + print0(f"Loading huggingface model from: {hf_path}") + model, tokenizer = load_hf_model(hf_path, device) + model_name = hf_path # just for logging + model_slug = hf_path.replace("/", "-") # for the output csv file + + # Evaluate the model + with autocast_ctx: + out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) + + # Write out the results to a csv file + core_metric = None + centered_results = {} + if ddp_rank == 0: + base_dir = get_base_dir() + output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") + os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) + results = out["results"] + centered_results = out["centered_results"] + core_metric = out["core_metric"] + with open(output_csv_path, "w") as f: + f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") + for label in results: + f.write( + f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n" + ) + f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") + # Print the content of the csv file to console too + print0("=" * 80) + print0(f"Model: {model_name}") + print0("=" * 80) + with open(output_csv_path, "r") as f: + print0(f.read()) + + # Log to report + from report import get_report + + get_report().log( + section="Base model evaluation", + data=[ + { + "Model": model_name, + "CORE metric": core_metric, + }, + centered_results, # the full table + ], + ) + + compute_cleanup() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/language_models/report.py b/benchmarks/language_models/report.py new file mode 100644 index 000000000..5721b1be2 --- /dev/null +++ b/benchmarks/language_models/report.py @@ -0,0 +1,446 @@ +""" +Utilities for generating training report cards. More messy code than usual, will fix. 
+""" + +import os +import re +import shutil +import subprocess +import socket +import datetime +import platform +import psutil +import torch + + +def run_command(cmd): + """Run a shell command and return output, or None if it fails.""" + try: + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + return result.stdout.strip() + return None + except: + return None + + +def get_git_info(): + """Get current git commit, branch, and dirty status.""" + info = {} + info["commit"] = run_command("git rev-parse --short HEAD") or "unknown" + info["branch"] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown" + + # Check if repo is dirty (has uncommitted changes) + status = run_command("git status --porcelain") + info["dirty"] = bool(status) if status is not None else False + + # Get commit message + info["message"] = run_command("git log -1 --pretty=%B") or "" + info["message"] = info["message"].split("\n")[0][:80] # First line, truncated + + return info + + +def get_gpu_info(): + """Get GPU information.""" + if not torch.cuda.is_available(): + return {"available": False} + + num_devices = torch.cuda.device_count() + info = {"available": True, "count": num_devices, "names": [], "memory_gb": []} + + for i in range(num_devices): + props = torch.cuda.get_device_properties(i) + info["names"].append(props.name) + info["memory_gb"].append(props.total_memory / (1024**3)) + + # Get CUDA version + info["cuda_version"] = torch.version.cuda or "unknown" + + return info + + +def get_system_info(): + """Get system information.""" + info = {} + + # Basic system info + info["hostname"] = socket.gethostname() + info["platform"] = platform.system() + info["python_version"] = platform.python_version() + info["torch_version"] = torch.__version__ + + # CPU and memory + info["cpu_count"] = psutil.cpu_count(logical=False) + info["cpu_count_logical"] = psutil.cpu_count(logical=True) + info["memory_gb"] = psutil.virtual_memory().total / (1024**3) + + # User and environment + info["user"] = os.environ.get("USER", "unknown") + info["nanochat_base_dir"] = os.environ.get("NANOCHAT_BASE_DIR", "out") + info["working_dir"] = os.getcwd() + + return info + + +def estimate_cost(gpu_info, runtime_hours=None): + """Estimate training cost based on GPU type and runtime.""" + + # Rough pricing, from Lambda Cloud + default_rate = 2.0 + gpu_hourly_rates = { + "H100": 3.00, + "A100": 1.79, + "V100": 0.55, + } + + if not gpu_info.get("available"): + return None + + # Try to identify GPU type from name + hourly_rate = None + gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown" + for gpu_type, rate in gpu_hourly_rates.items(): + if gpu_type in gpu_name: + hourly_rate = rate * gpu_info["count"] + break + + if hourly_rate is None: + hourly_rate = default_rate * gpu_info["count"] # Default estimate + + return { + "hourly_rate": hourly_rate, + "gpu_type": gpu_name, + "estimated_total": hourly_rate * runtime_hours if runtime_hours else None, + } + + +def generate_header(): + """Generate the header for a training report.""" + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + git_info = get_git_info() + gpu_info = get_gpu_info() + sys_info = get_system_info() + cost_info = estimate_cost(gpu_info) + + header = f"""# nanochat training report + +Generated: {timestamp} + +## Environment + +### Git Information +- Branch: {git_info["branch"]} +- Commit: {git_info["commit"]} {"(dirty)" if git_info["dirty"] else "(clean)"} +- Message: 
{git_info["message"]} + +### Hardware +- Platform: {sys_info["platform"]} +- CPUs: {sys_info["cpu_count"]} cores ({sys_info["cpu_count_logical"]} logical) +- Memory: {sys_info["memory_gb"]:.1f} GB +""" + + if gpu_info.get("available"): + gpu_names = ", ".join(set(gpu_info["names"])) + total_vram = sum(gpu_info["memory_gb"]) + header += f"""- GPUs: {gpu_info["count"]}x {gpu_names} +- GPU Memory: {total_vram:.1f} GB total +- CUDA Version: {gpu_info["cuda_version"]} +""" + else: + header += "- GPUs: None available\n" + + if cost_info and cost_info["hourly_rate"] > 0: + header += f"""- Hourly Rate: ${cost_info["hourly_rate"]:.2f}/hour\n""" + + header += f""" +### Software +- Python: {sys_info["python_version"]} +- PyTorch: {sys_info["torch_version"]} + +""" + + # bloat metrics: package all of the source code and assess its weight + packaged = run_command( + 'files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml' + ) + num_chars = len(packaged) + num_lines = len(packaged.split("\n")) + num_files = len([x for x in packaged.split("\n") if x.startswith("")]) + num_tokens = num_chars // 4 # assume approximately 4 chars per token + + # count dependencies via uv.lock + uv_lock_lines = 0 + if os.path.exists("uv.lock"): + with open("uv.lock", "r") as f: + uv_lock_lines = len(f.readlines()) + + header += f""" +### Bloat +- Characters: {num_chars:,} +- Lines: {num_lines:,} +- Files: {num_files:,} +- Tokens (approx): {num_tokens:,} +- Dependencies (uv.lock lines): {uv_lock_lines:,} + +""" + return header + + +# ----------------------------------------------------------------------------- + + +def slugify(text): + """Slugify a text string.""" + return text.lower().replace(" ", "-") + + +# the expected files and their order +EXPECTED_FILES = [ + "tokenizer-training.md", + "tokenizer-evaluation.md", + "base-model-training.md", + "base-model-loss.md", + "base-model-evaluation.md", + "midtraining.md", + "chat-evaluation-mid.md", + "chat-sft.md", + "chat-evaluation-sft.md", + "chat-rl.md", + "chat-evaluation-rl.md", +] +# the metrics we're currently interested in +chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"] + + +def extract(section, keys): + """simple def to extract a single key from a section""" + if not isinstance(keys, list): + keys = [keys] # convenience + out = {} + for line in section.split("\n"): + for key in keys: + if key in line: + out[key] = line.split(":")[1].strip() + return out + + +def extract_timestamp(content, prefix): + """Extract timestamp from content with given prefix.""" + for line in content.split("\n"): + if line.startswith(prefix): + time_str = line.split(":", 1)[1].strip() + try: + return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + except: + pass + return None + + +class Report: + """Maintains a bunch of logs, generates a final markdown report.""" + + def __init__(self, report_dir): + os.makedirs(report_dir, exist_ok=True) + self.report_dir = report_dir + + def log(self, section, data): + """Log a section of data to the report.""" + slug = slugify(section) + file_name = f"{slug}.md" + file_path = os.path.join(self.report_dir, file_name) + with open(file_path, "w") as f: + f.write(f"## {section}\n") + f.write( + f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" + ) + for item in data: + if not item: + # skip falsy values like None or empty dict etc. 
+ continue + if isinstance(item, str): + # directly write the string + f.write(item) + else: + # render a dict + for k, v in item.items(): + if isinstance(v, float): + vstr = f"{v:.4f}" + elif isinstance(v, int) and v >= 10000: + vstr = f"{v:,.0f}" + else: + vstr = str(v) + f.write(f"- {k}: {vstr}\n") + f.write("\n") + return file_path + + def generate(self): + """Generate the final report.""" + report_dir = self.report_dir + report_file = os.path.join(report_dir, "report.md") + print(f"Generating report to {report_file}") + final_metrics = {} # the most important final metrics we'll add as table at the end + start_time = None + end_time = None + with open(report_file, "w") as out_file: + # write the header first + header_file = os.path.join(report_dir, "header.md") + if os.path.exists(header_file): + with open(header_file, "r") as f: + header_content = f.read() + out_file.write(header_content) + start_time = extract_timestamp(header_content, "Run started:") + # capture bloat data for summary later (the stuff after Bloat header and until \n\n) + bloat_data = re.search( + r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL + ) + bloat_data = bloat_data.group(1) if bloat_data else "" + else: + start_time = ( + None # will cause us to not write the total wall clock time + ) + bloat_data = "[bloat data missing]" + print( + f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?" + ) + # process all the individual sections + for file_name in EXPECTED_FILES: + section_file = os.path.join(report_dir, file_name) + if not os.path.exists(section_file): + print(f"Warning: {section_file} does not exist, skipping") + continue + with open(section_file, "r") as in_file: + section = in_file.read() + # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) + if "rl" not in file_name: + # Skip RL sections for end_time calculation because RL is experimental + end_time = extract_timestamp(section, "timestamp:") + # extract the most important metrics from the sections + if file_name == "base-model-evaluation.md": + final_metrics["base"] = extract(section, "CORE") + if file_name == "chat-evaluation-mid.md": + final_metrics["mid"] = extract(section, chat_metrics) + if file_name == "chat-evaluation-sft.md": + final_metrics["sft"] = extract(section, chat_metrics) + if file_name == "chat-evaluation-rl.md": + final_metrics["rl"] = extract( + section, "GSM8K" + ) # RL only evals GSM8K + # append this section of the report + out_file.write(section) + out_file.write("\n") + # add the final metrics table + out_file.write("## Summary\n\n") + # Copy over the bloat metrics from the header + out_file.write(bloat_data) + out_file.write("\n\n") + # Collect all unique metric names + all_metrics = set() + for stage_metrics in final_metrics.values(): + all_metrics.update(stage_metrics.keys()) + # Custom ordering: CORE first, ChatCORE last, rest in middle + all_metrics = sorted( + all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x) + ) + # Fixed column widths + stages = ["base", "mid", "sft", "rl"] + metric_width = 15 + value_width = 8 + # Write table header + header = f"| {'Metric'.ljust(metric_width)} |" + for stage in stages: + header += f" {stage.upper().ljust(value_width)} |" + out_file.write(header + "\n") + # Write separator + separator = f"|{'-' * (metric_width + 2)}|" + for stage in stages: + separator += f"{'-' * (value_width + 2)}|" + out_file.write(separator + "\n") + # Write table rows + for metric in all_metrics: + row = f"| 
{metric.ljust(metric_width)} |" + for stage in stages: + value = final_metrics.get(stage, {}).get(metric, "-") + row += f" {str(value).ljust(value_width)} |" + out_file.write(row + "\n") + out_file.write("\n") + # Calculate and write total wall clock time + if start_time and end_time: + duration = end_time - start_time + total_seconds = int(duration.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + out_file.write(f"Total wall clock time: {hours}h{minutes}m\n") + else: + out_file.write("Total wall clock time: unknown\n") + # also cp the report.md file to current directory + print(f"Copying report.md to current directory for convenience") + shutil.copy(report_file, "report.md") + return report_file + + def reset(self): + """Reset the report.""" + # Remove section files + for file_name in EXPECTED_FILES: + file_path = os.path.join(self.report_dir, file_name) + if os.path.exists(file_path): + os.remove(file_path) + # Remove report.md if it exists + report_file = os.path.join(self.report_dir, "report.md") + if os.path.exists(report_file): + os.remove(report_file) + # Generate and write the header section with start timestamp + header_file = os.path.join(self.report_dir, "header.md") + header = generate_header() + start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with open(header_file, "w") as f: + f.write(header) + f.write(f"Run started: {start_time}\n\n---\n\n") + print(f"Reset report and wrote header to {header_file}") + + +# ----------------------------------------------------------------------------- +# nanochat-specific convenience functions + + +class DummyReport: + def log(self, *args, **kwargs): + pass + + def reset(self, *args, **kwargs): + pass + + +def get_report(): + # just for convenience, only rank 0 logs to report + from common import get_base_dir, get_dist_info + + ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + if ddp_rank == 0: + report_dir = os.path.join(get_base_dir(), "report") + return Report(report_dir) + else: + return DummyReport() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Generate or reset nanochat training reports." + ) + parser.add_argument( + "command", + nargs="?", + default="generate", + choices=["generate", "reset"], + help="Operation to perform (default: generate)", + ) + args = parser.parse_args() + if args.command == "generate": + get_report().generate() + elif args.command == "reset": + get_report().reset() diff --git a/benchmarks/language_models/tokenizer.py b/benchmarks/language_models/tokenizer.py new file mode 100644 index 000000000..fc962c1d0 --- /dev/null +++ b/benchmarks/language_models/tokenizer.py @@ -0,0 +1,234 @@ +""" +BPE Tokenizer in the style of GPT-4. + +Two implementations are available: +1) HuggingFace Tokenizer that can do both training and inference but is really confusing +2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. 
+""" + +import os + +SPECIAL_TOKENS = [ + # every document begins with the Beginning of Sequence (BOS) token that delimits documents + "<|bos|>", + # tokens below are only used during finetuning to render Conversations into token ids + "<|user_start|>", # user messages + "<|user_end|>", + "<|assistant_start|>", # assistant messages + "<|assistant_end|>", + "<|python_start|>", # assistant invokes python REPL tool + "<|python_end|>", + "<|output_start|>", # python REPL outputs back to assistant + "<|output_end|>", +] + +# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} +# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. +# I haven't validated that this is actually a good idea, TODO. +SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" + +# ----------------------------------------------------------------------------- +# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer +from tokenizers import Tokenizer as HFTokenizer +from tokenizers import pre_tokenizers, decoders, Regex +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + + +class HuggingFaceTokenizer: + """Light wrapper around HuggingFace Tokenizer for some utilities""" + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, hf_path): + # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") + tokenizer = HFTokenizer.from_pretrained(hf_path) + return cls(tokenizer) + + @classmethod + def from_directory(cls, tokenizer_dir): + # init from a local directory on disk (e.g. "out/tokenizer") + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + tokenizer = HFTokenizer.from_file(tokenizer_path) + return cls(tokenizer) + + @classmethod + def train_from_iterator(cls, text_iterator, vocab_size): + # train from an iterator of text + # Configure the HuggingFace Tokenizer + tokenizer = HFTokenizer( + BPE( + byte_fallback=True, # needed! + unk_token=None, + fuse_unk=False, + ) + ) + # Normalizer: None + tokenizer.normalizer = None + # Pre-tokenizer: GPT-4 style + # the regex pattern used by GPT-4 to split text into groups before BPE + # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to + # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. + # (but I haven't validated this! TODO) + gpt4_split_regex = Regex( + SPLIT_PATTERN + ) # huggingface demands that you wrap it in Regex!! 
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + pattern=gpt4_split_regex, behavior="isolated", invert=False + ), + pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), + ] + ) + # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) + tokenizer.decoder = decoders.ByteLevel() + # Post-processor: None + tokenizer.post_processor = None + # Trainer: BPE + trainer = BpeTrainer( + vocab_size=vocab_size, + show_progress=True, + min_frequency=0, # no minimum frequency + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + special_tokens=SPECIAL_TOKENS, + ) + # Kick off the training + tokenizer.train_from_iterator(text_iterator, trainer) + return cls(tokenizer) + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + + def get_special_tokens(self): + special_tokens_map = self.tokenizer.get_added_tokens_decoder() + special_tokens = [w.content for w in special_tokens_map.values()] + return special_tokens + + def id_to_token(self, id): + return self.tokenizer.id_to_token(id) + + def _encode_one(self, text, prepend=None, append=None): + # encode a single string + # prepend/append can be either a string of a special token or a token id directly. + assert isinstance(text, str) + ids = [] + if prepend is not None: + prepend_id = ( + prepend if isinstance(prepend, int) else self.encode_special(prepend) + ) + ids.append(prepend_id) + ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) + if append is not None: + append_id = ( + append if isinstance(append, int) else self.encode_special(append) + ) + ids.append(append_id) + return ids + + def encode_special(self, text): + # encode a single special token via exact match + return self.tokenizer.token_to_id(text) + + def get_bos_token_id(self): + bos = self.encode_special("<|bos|>") + return bos + + def encode(self, text, *args, **kwargs): + if isinstance(text, str): + return self._encode_one(text, *args, **kwargs) + elif isinstance(text, list): + return [self._encode_one(t, *args, **kwargs) for t in text] + else: + raise ValueError(f"Invalid input type: {type(text)}") + + def __call__(self, *args, **kwargs): + return self.encode(*args, **kwargs) + + def decode(self, ids): + return self.tokenizer.decode(ids, skip_special_tokens=False) + + def save(self, tokenizer_dir): + # save the tokenizer to disk + os.makedirs(tokenizer_dir, exist_ok=True) + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + self.tokenizer.save(tokenizer_path) + print(f"Saved tokenizer to {tokenizer_path}") + + +# ----------------------------------------------------------------------------- +# Universal Tokenizer Wrapper that works with any HuggingFace model +class UniversalHuggingFaceTokenizer: + """Universal wrapper that works with any HuggingFace model""" + + def __init__(self, tokenizer_dir, model_config=None): + self.tokenizer = HuggingFaceTokenizer.from_directory(tokenizer_dir) + self.model_config = model_config + self._pad_token_id = None + self._bos_token_id = None + self._detect_special_tokens() + + def _detect_special_tokens(self): + """Auto-detect special tokens for any model""" + # Try to get pad token from tokenizer + if hasattr(self.tokenizer, "tokenizer"): + tokenizer_obj = self.tokenizer.tokenizer + + # Try common pad token names + pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] + for candidate in pad_candidates: + if hasattr(tokenizer_obj, "token_to_id"): + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + 
self._pad_token_id = token_id + break + + # Try common BOS token names + bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] + for candidate in bos_candidates: + if hasattr(tokenizer_obj, "token_to_id"): + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + self._bos_token_id = token_id + break + + # Fallback to config-based detection + if self.model_config and hasattr(self.model_config, "pad_token_id"): + self._pad_token_id = self.model_config.pad_token_id + + if self.model_config and hasattr(self.model_config, "bos_token_id"): + self._bos_token_id = self.model_config.bos_token_id + + # Final fallbacks based on common patterns + if self._pad_token_id is None: + # Most models use either 0 or their EOS token + self._pad_token_id = 0 + + if self._bos_token_id is None: + # Use pad token as fallback + self._bos_token_id = self._pad_token_id + + def get_bos_token_id(self): + return self._bos_token_id + + def get_pad_token_id(self): + return self._pad_token_id + + def __call__(self, prompts, prepend=None): + """Universal tokenization method""" + if isinstance(prompts, str): + prompts = [prompts] + + result = [] + for prompt in prompts: + tokens = self.tokenizer.encode(prompt) + if prepend is not None: + tokens = [prepend] + tokens + result.append(tokens) + + return result[0] if len(result) == 1 else result + + def __getattr__(self, name): + return getattr(self.tokenizer, name) From 06e666a5401ce98fd9c5ac769a5d7a63068a0b61 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 21:14:28 -0400 Subject: [PATCH 02/17] Added evaluation script for running NanoChat benchmark on HuggingFace models. Features: - Automatically sets up the NanoChat datasets under `.cache/nanochat`. - Downloads and unpacks the CORE evaluation bundle if not already available. - Invokes `evaluate_model.py` with the specified HuggingFace model path. - Adds argument parsing for `` and optional `[max_per_task]`. - Defaults `max_per_task` to 16 when not provided. Usage: bash evaluate_model.sh [optional: max_per_task] --- benchmarks/language_models/evaluate_model.sh | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 benchmarks/language_models/evaluate_model.sh diff --git a/benchmarks/language_models/evaluate_model.sh b/benchmarks/language_models/evaluate_model.sh new file mode 100644 index 000000000..e269bbecc --- /dev/null +++ b/benchmarks/language_models/evaluate_model.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Usage: bash evaluate_model.sh [optional: max_per_task] +# Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. +# model_path: Path to the HuggingFace model to evaluate. +# max_per_task: (Optional) Maximum number of examples to evaluate per task. + +export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" +mkdir -p $NANOCHAT_BASE_DIR + +EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip +if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then + curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL + unzip -q eval_bundle.zip + rm eval_bundle.zip + mv eval_bundle $NANOCHAT_BASE_DIR +fi + +if [ -z "$2" ]; then + MAX_PER_TASK=16 +else + MAX_PER_TASK=$2 +fi +uv run evaluate_model.py --hf_path=$1 --max-per-task=$MAX_PER_TASK + From 473d350cad3a20317b5fc71f4c98caf6e8eab58c Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 21:16:12 -0400 Subject: [PATCH 03/17] Added missing explanation of --max_per_task default value. 
--- benchmarks/language_models/evaluate_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/language_models/evaluate_model.sh b/benchmarks/language_models/evaluate_model.sh index e269bbecc..4c443e46b 100644 --- a/benchmarks/language_models/evaluate_model.sh +++ b/benchmarks/language_models/evaluate_model.sh @@ -2,7 +2,7 @@ # Usage: bash evaluate_model.sh [optional: max_per_task] # Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. # model_path: Path to the HuggingFace model to evaluate. -# max_per_task: (Optional) Maximum number of examples to evaluate per task. +# max_per_task: (Optional) Maximum number of examples to evaluate per task, default setting to 16. export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR From f70334894982abc799cb2b14b7065baf04702a75 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Mon, 27 Oct 2025 22:04:07 -0400 Subject: [PATCH 04/17] Moved NanoChat benchmark from benchmarks/language_models to plato/benchmarks/language_model. --- {benchmarks => plato/benchmarks}/language_models/common.py | 0 {benchmarks => plato/benchmarks}/language_models/core_eval.py | 0 .../benchmarks}/language_models/evaluate_model.py | 0 .../benchmarks}/language_models/evaluate_model.sh | 0 {benchmarks => plato/benchmarks}/language_models/report.py | 0 {benchmarks => plato/benchmarks}/language_models/tokenizer.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename {benchmarks => plato/benchmarks}/language_models/common.py (100%) rename {benchmarks => plato/benchmarks}/language_models/core_eval.py (100%) rename {benchmarks => plato/benchmarks}/language_models/evaluate_model.py (100%) rename {benchmarks => plato/benchmarks}/language_models/evaluate_model.sh (100%) rename {benchmarks => plato/benchmarks}/language_models/report.py (100%) rename {benchmarks => plato/benchmarks}/language_models/tokenizer.py (100%) diff --git a/benchmarks/language_models/common.py b/plato/benchmarks/language_models/common.py similarity index 100% rename from benchmarks/language_models/common.py rename to plato/benchmarks/language_models/common.py diff --git a/benchmarks/language_models/core_eval.py b/plato/benchmarks/language_models/core_eval.py similarity index 100% rename from benchmarks/language_models/core_eval.py rename to plato/benchmarks/language_models/core_eval.py diff --git a/benchmarks/language_models/evaluate_model.py b/plato/benchmarks/language_models/evaluate_model.py similarity index 100% rename from benchmarks/language_models/evaluate_model.py rename to plato/benchmarks/language_models/evaluate_model.py diff --git a/benchmarks/language_models/evaluate_model.sh b/plato/benchmarks/language_models/evaluate_model.sh similarity index 100% rename from benchmarks/language_models/evaluate_model.sh rename to plato/benchmarks/language_models/evaluate_model.sh diff --git a/benchmarks/language_models/report.py b/plato/benchmarks/language_models/report.py similarity index 100% rename from benchmarks/language_models/report.py rename to plato/benchmarks/language_models/report.py diff --git a/benchmarks/language_models/tokenizer.py b/plato/benchmarks/language_models/tokenizer.py similarity index 100% rename from benchmarks/language_models/tokenizer.py rename to plato/benchmarks/language_models/tokenizer.py From 08dc54eda1963d7345bef72e9ec091cf1b9729b1 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 02:54:48 -0400 Subject: [PATCH 05/17] Cleaned up unused code 
from nanochat. --- plato/benchmarks/language_models/common.py | 154 ------ plato/benchmarks/language_models/core_eval.py | 269 ----------- .../language_models/evaluate_model.py | 224 --------- .../language_models/evaluate_model.sh | 24 - plato/benchmarks/language_models/report.py | 446 ------------------ plato/benchmarks/language_models/tokenizer.py | 234 --------- 6 files changed, 1351 deletions(-) delete mode 100644 plato/benchmarks/language_models/common.py delete mode 100644 plato/benchmarks/language_models/core_eval.py delete mode 100644 plato/benchmarks/language_models/evaluate_model.py delete mode 100644 plato/benchmarks/language_models/evaluate_model.sh delete mode 100644 plato/benchmarks/language_models/report.py delete mode 100644 plato/benchmarks/language_models/tokenizer.py diff --git a/plato/benchmarks/language_models/common.py b/plato/benchmarks/language_models/common.py deleted file mode 100644 index c13dfd424..000000000 --- a/plato/benchmarks/language_models/common.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Common utilities for nanochat. -""" - -import os -import re -import logging -import torch -import torch.distributed as dist - - -class ColoredFormatter(logging.Formatter): - """Custom formatter that adds colors to log messages.""" - - # ANSI color codes - COLORS = { - "DEBUG": "\033[36m", # Cyan - "INFO": "\033[32m", # Green - "WARNING": "\033[33m", # Yellow - "ERROR": "\033[31m", # Red - "CRITICAL": "\033[35m", # Magenta - } - RESET = "\033[0m" - BOLD = "\033[1m" - - def format(self, record): - # Add color to the level name - levelname = record.levelname - if levelname in self.COLORS: - record.levelname = ( - f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}" - ) - # Format the message - message = super().format(record) - # Add color to specific parts of the message - if levelname == "INFO": - # Highlight numbers and percentages - message = re.sub( - r"(\d+\.?\d*\s*(?:GB|MB|%|docs))", - rf"{self.BOLD}\1{self.RESET}", - message, - ) - message = re.sub( - r"(Shard \d+)", - rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}", - message, - ) - return message - - -def setup_default_logging(): - handler = logging.StreamHandler() - handler.setFormatter( - ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - ) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - - -setup_default_logging() -logger = logging.getLogger(__name__) - - -def get_base_dir(): - # co-locate nanochat intermediates with other cached data in ~/.cache (by default) - if os.environ.get("NANOCHAT_BASE_DIR"): - nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR") - else: - home_dir = os.path.expanduser("~") - cache_dir = os.path.join(home_dir, ".cache") - nanochat_dir = os.path.join(cache_dir, "nanochat") - os.makedirs(nanochat_dir, exist_ok=True) - return nanochat_dir - - -def print0(s="", **kwargs): - ddp_rank = int(os.environ.get("RANK", 0)) - if ddp_rank == 0: - print(s, **kwargs) - - -def is_ddp(): - # TODO is there a proper way - return int(os.environ.get("RANK", -1)) != -1 - - -def get_dist_info(): - if is_ddp(): - assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]) - ddp_rank = int(os.environ["RANK"]) - ddp_local_rank = int(os.environ["LOCAL_RANK"]) - ddp_world_size = int(os.environ["WORLD_SIZE"]) - return True, ddp_rank, ddp_local_rank, ddp_world_size - else: - return False, 0, 0, 1 - - -def autodetect_device_type(): - # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU - if torch.cuda.is_available(): - 
device_type = "cuda" - elif torch.backends.mps.is_available(): - device_type = "mps" - else: - device_type = "cpu" - print0(f"Autodetected device type: {device_type}") - return device_type - - -def compute_init(device_type="cuda"): # cuda|cpu|mps - """Basic initialization that we keep doing over and over, so make common.""" - - assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm" - if device_type == "cuda": - assert torch.cuda.is_available(), ( - "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'" - ) - if device_type == "mps": - assert torch.backends.mps.is_available(), ( - "Your PyTorch installation is not configured for MPS but device_type is 'mps'" - ) - - # Reproducibility - torch.manual_seed(42) - if device_type == "cuda": - torch.cuda.manual_seed(42) - # skipping full reproducibility for now, possibly investigate slowdown later - # torch.use_deterministic_algorithms(True) - - # Precision - if device_type == "cuda": - torch.set_float32_matmul_precision( - "high" - ) # uses tf32 instead of fp32 for matmuls - - # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA - ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - if ddp and device_type == "cuda": - device = torch.device("cuda", ddp_local_rank) - torch.cuda.set_device(device) # make "cuda" default to this device - dist.init_process_group(backend="nccl", device_id=device) - dist.barrier() - else: - device = torch.device(device_type) # mps|cpu - - if ddp_rank == 0: - logger.info(f"Distributed world size: {ddp_world_size}") - - return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device - - -def compute_cleanup(): - """Companion function to compute_init, to clean things up before script exit""" - if is_ddp(): - dist.destroy_process_group() diff --git a/plato/benchmarks/language_models/core_eval.py b/plato/benchmarks/language_models/core_eval.py deleted file mode 100644 index 547d22cd3..000000000 --- a/plato/benchmarks/language_models/core_eval.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Functions for evaluating the CORE metric, as described in the DCLM paper. -https://arxiv.org/abs/2406.11794 - -TODOs: -- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
-""" - -import random - -from jinja2 import Template -import torch -import torch.distributed as dist - -# ----------------------------------------------------------------------------- -# Prompt rendering utilities - - -def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): - """Render complete prompts for a multiple choice question""" - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} - -{% endfor -%} -{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - prompts = [template.render(choice=choice, **context) for choice in item["choices"]] - return prompts - - -def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): - """Render complete prompts for a schema question""" - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} - -{% endfor -%} -{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - prompts = [ - template.render(context=context_option, **context) - for context_option in item["context_options"] - ] - return prompts - - -def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): - """ - Render complete prompt for a language modeling task. - Notice that we manually trim the context in the template, - which in some datasets seems to have trailing whitespace (which we don't want). - """ - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} - -{% endfor -%} -{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - # Return two prompts: without and with the continuation - prompt_without = template.render(include_continuation=False, **context) - prompt_with = template.render(include_continuation=True, **context) - # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. - # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next - # token in prompt_with), meaning we don't get a nice and clean prefix in the token space - # to detect the final continuation. Tokenizers... 
- prompt_without = prompt_without.strip() - return [prompt_without, prompt_with] - - -def find_common_length(token_sequences, direction="left"): - """ - Find the length of the common prefix or suffix across token sequences - - direction: 'left' for prefix, 'right' for suffix - """ - min_len = min(len(seq) for seq in token_sequences) - indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] - # Find the first position where the token sequences differ - for i, idx in enumerate(indices): - token = token_sequences[0][idx] - if not all(seq[idx] == token for seq in token_sequences): - return i - return min_len - - -def stack_sequences(tokens, pad_token_id): - """Stack up a list of token sequences, pad to longest on the right""" - bsz, seq_len = len(tokens), max(len(x) for x in tokens) - input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) - for i, x in enumerate(tokens): - input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) - return input_ids - - -def batch_sequences_mc(tokenizer, prompts): - # In multiple choice, contexts are the same but the continuation is different (common prefix) - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - # figure out the start and end of each continuation - answer_start_idx = find_common_length(tokens, direction="left") - start_indices = [answer_start_idx] * len(prompts) - end_indices = [len(x) for x in tokens] - return tokens, start_indices, end_indices - - -def batch_sequences_schema(tokenizer, prompts): - # In schema tasks, contexts vary but continuation is the same (common suffix) - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - # figure out the start and end of each context - suffix_length = find_common_length(tokens, direction="right") - end_indices = [len(x) for x in tokens] - start_indices = [ei - suffix_length for ei in end_indices] - return tokens, start_indices, end_indices - - -def batch_sequences_lm(tokenizer, prompts): - # In LM tasks, we have two prompts: without and with continuation - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - tokens_without, tokens_with = tokens - start_idx, end_idx = len(tokens_without), len(tokens_with) - assert start_idx < end_idx, ( - "prompt without is supposed to be a prefix of prompt with" - ) - assert tokens_without == tokens_with[:start_idx], ( - "prompt without is supposed to be a prefix of prompt with" - ) - # we only need the with continuation prompt in the LM task, i.e. batch size of 1 - return [tokens_with], [start_idx], [end_idx] - - -@torch.no_grad() -def forward_model(model, input_ids): - """ - Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. - The last column of losses is set to nan because we don't have autoregressive targets there. 
- """ - batch_size, seq_len = input_ids.size() - outputs = model(input_ids) - # Roll the tensor to the left by one position to get the (autoregressive) target ids - target_ids = torch.roll(input_ids, shifts=-1, dims=1) - # Calculate cross entropy at all positions - losses = torch.nn.functional.cross_entropy( - outputs.view(batch_size * seq_len, -1), - target_ids.view(batch_size * seq_len), - reduction="none", - ).view(batch_size, seq_len) - # Set the last column to be nan because there is no autoregressive loss there - losses[:, -1] = float("nan") - # Get the argmax predictions at each position - predictions = outputs.argmax(dim=-1) - return losses, predictions - - -@torch.no_grad() -def evaluate_example(idx, model, tokenizer, data, device, task_meta): - """Evaluate a single example, return True if correct, False otherwise""" - item = data[idx] - task_type = task_meta["task_type"] - num_fewshot = task_meta["num_fewshot"] - continuation_delimiter = task_meta["continuation_delimiter"] - - # Sample few-shot examples (excluding current item) - fewshot_examples = [] - if num_fewshot > 0: - rng = random.Random(1234 + idx) - available_indices = [i for i in range(len(data)) if i != idx] - fewshot_indices = rng.sample(available_indices, num_fewshot) - fewshot_examples = [data[i] for i in fewshot_indices] - - # Render prompts and batch sequences based on task type - if task_type == "multiple_choice": - prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) - elif task_type == "schema": - prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) - elif task_type == "language_modeling": - prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) - else: - raise ValueError(f"Unsupported task type: {task_type}") - - # Some models can't forward sequences beyond a certain length (e.g. GPT-2) - # In these cases, we have to truncate sequences to max length and adjust the indices - if hasattr(model, "max_seq_len") and model.max_seq_len is not None: - max_tokens = model.max_seq_len - new_tokens, new_start_idxs, new_end_idxs = [], [], [] - for t, s, e in zip(tokens, start_idxs, end_idxs): - if len(t) > max_tokens: - num_to_crop = len(t) - max_tokens - new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens - new_start_idxs.append(s - num_to_crop) # shift the indices down - new_end_idxs.append(e - num_to_crop) - assert s - num_to_crop >= 0, "this should never happen right?" - assert e - num_to_crop >= 0, "this should never happen right?" 
- else: - new_tokens.append(t) # keep unchanged - new_start_idxs.append(s) - new_end_idxs.append(e) - tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs - - # Stack up all the sequences into a batch - pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok - input_ids = stack_sequences(tokens, pad_token_id) - input_ids = input_ids.to(device) - - # Forward the model, get the autoregressive loss and argmax prediction at each token - losses, predictions = forward_model(model, input_ids) - - # See if the losses/predictions come out correctly - if task_type == "language_modeling": - # language modeling task is currently always batch size 1 - si = start_idxs[0] - ei = end_idxs[0] - # predictions[i] predict input_ids[i+1] autoregressively - predicted_tokens = predictions[0, si - 1 : ei - 1] - actual_tokens = input_ids[0, si:ei] - is_correct = torch.all(predicted_tokens == actual_tokens).item() - elif task_type in ["multiple_choice", "schema"]: - # For MC/schema: find the option with lowest average loss - mean_losses = [ - losses[i, si - 1 : ei - 1].mean().item() - for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) - ] - pred_idx = mean_losses.index(min(mean_losses)) - is_correct = pred_idx == item["gold"] - else: - raise ValueError(f"Unsupported task type: {task_type}") - - return is_correct - - -def evaluate_task(model, tokenizer, data, device, task_meta): - """ - This function is responsible for evaluating one task across many examples. - It also handles dispatch to all processes if the script is run with torchrun. - """ - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - correct = torch.zeros(len(data), dtype=torch.float32, device=device) - # stride the examples to each rank - for idx in range(rank, len(data), world_size): - is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) - correct[idx] = float(is_correct) - # sync results across all the processes if running distributed - if world_size > 1: - dist.barrier() - dist.all_reduce(correct, op=dist.ReduceOp.SUM) - # compute the mean - mean_correct = correct.mean().item() - return mean_correct diff --git a/plato/benchmarks/language_models/evaluate_model.py b/plato/benchmarks/language_models/evaluate_model.py deleted file mode 100644 index b133013c5..000000000 --- a/plato/benchmarks/language_models/evaluate_model.py +++ /dev/null @@ -1,224 +0,0 @@ -""" -Evlauate the CORE metric for a given model. - -Run on a single GPU: -python base_eval.py --hf-path - -The script will print the CORE metric to the console. -""" - -import os -import time -import json -import random -import yaml -from contextlib import nullcontext - -import pandas as pd -import torch - -from common import ( - compute_init, - compute_cleanup, - print0, - get_base_dir, - autodetect_device_type, -) -from tokenizer import UniversalHuggingFaceTokenizer -from core_eval import evaluate_task - -# ----------------------------------------------------------------------------- -# nanoChat specific function dealing with I/O etc. - - -def evaluate_model(model, tokenizer, device, max_per_task=-1): - """ - Evaluate a base model on the CORE benchmark. - - max_per_task: crop the data to this many examples per task for testing (-1 = disable) - TODO: clean up this function, delete the need for all the files, for pandas dependency, etc. 
- """ - # Load config and task metadata - base_dir = get_base_dir() - eval_bundle_dir = os.path.join(base_dir, "eval_bundle") - config_path = os.path.join(eval_bundle_dir, "core.yaml") - data_base_path = os.path.join(eval_bundle_dir, "eval_data") - eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") - with open(config_path, "r") as f: - config = yaml.safe_load(f) - tasks = config["icl_tasks"] - eval_metadata = pd.read_csv(eval_meta_data) - - # Evaluate each task - results = {} - centered_results = {} - for task in tasks: - start_time = time.time() - label = task["label"] - task_meta = { - "task_type": task["icl_task_type"], - "dataset_uri": task["dataset_uri"], - "num_fewshot": task["num_fewshot"][0], - "continuation_delimiter": task.get("continuation_delimiter", " "), - } - print0( - f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", - end="", - ) - - # Load data for this task - data_path = os.path.join(data_base_path, task_meta["dataset_uri"]) - with open(data_path, "r") as f: - data = [json.loads(line.strip()) for line in f] - - # shuffle the data because in many cases it appears ordered but we want - # the abillity to only run a subset of the data for debugging purposes etc. - shuffle_rng = random.Random(1337) - shuffle_rng.shuffle(data) - if max_per_task > 0: - data = data[:max_per_task] - - # run the evaluation for this task - accuracy = evaluate_task(model, tokenizer, data, device, task_meta) - - results[label] = accuracy - row = eval_metadata[eval_metadata["Eval Task"] == label] - random_baseline = row["Random baseline"].values[0] - centered_result = (accuracy - 0.01 * random_baseline) / ( - 1.0 - 0.01 * random_baseline - ) - centered_results[label] = centered_result - end_time = time.time() - print0( - f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s" - ) - - core_metric = sum(centered_results.values()) / len(centered_results) - out = { - "results": results, - "centered_results": centered_results, - "core_metric": core_metric, - } - return out - - -# ----------------------------------------------------------------------------- -# HuggingFace loading utilities and light wrappers for a model - - -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def __call__(self, input_ids): - outputs = self.model(input_ids) - logits = outputs.logits - return logits - - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - from transformers import AutoModelForCausalLM, AutoConfig - - if os.path.exists(hf_path): - hf_path = os.path.abspath(hf_path) - print0(f"Using absolute path: {hf_path}") - - # Load config to help with token detection - config = AutoConfig.from_pretrained(hf_path, local_files_only=True) - model = AutoModelForCausalLM.from_pretrained(hf_path, local_files_only=True) - tokenizer = UniversalHuggingFaceTokenizer(hf_path, config) - - model.to(device) - model.eval() - max_seq_len = 1024 # subject to change based on model type, for GPT-2 it's 1024 - model = ModelWrapper(model, max_seq_len=max_seq_len) - return model, tokenizer - - -# ----------------------------------------------------------------------------- -def main(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf-path", - type=str, - default=None, - required=True, - help="HuggingFace model path to evaluate", - ) - 
parser.add_argument( - "--max-per-task", - type=int, - default=-1, - help="Max examples per task to evaluate (-1 = disable)", - ) - args = parser.parse_args() - - # distributed / precision setup - device_type = autodetect_device_type() - ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) - autocast_ctx = ( - torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) - if device_type == "cuda" - else nullcontext() - ) - - # Load model and tokenizer from command line or from file system - # atm assume that if a path is given, it's a huggingface model path - hf_path = args.hf_path - print0(f"Loading huggingface model from: {hf_path}") - model, tokenizer = load_hf_model(hf_path, device) - model_name = hf_path # just for logging - model_slug = hf_path.replace("/", "-") # for the output csv file - - # Evaluate the model - with autocast_ctx: - out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) - - # Write out the results to a csv file - core_metric = None - centered_results = {} - if ddp_rank == 0: - base_dir = get_base_dir() - output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") - os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) - results = out["results"] - centered_results = out["centered_results"] - core_metric = out["core_metric"] - with open(output_csv_path, "w") as f: - f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") - for label in results: - f.write( - f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n" - ) - f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") - # Print the content of the csv file to console too - print0("=" * 80) - print0(f"Model: {model_name}") - print0("=" * 80) - with open(output_csv_path, "r") as f: - print0(f.read()) - - # Log to report - from report import get_report - - get_report().log( - section="Base model evaluation", - data=[ - { - "Model": model_name, - "CORE metric": core_metric, - }, - centered_results, # the full table - ], - ) - - compute_cleanup() - - -if __name__ == "__main__": - main() diff --git a/plato/benchmarks/language_models/evaluate_model.sh b/plato/benchmarks/language_models/evaluate_model.sh deleted file mode 100644 index 4c443e46b..000000000 --- a/plato/benchmarks/language_models/evaluate_model.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Usage: bash evaluate_model.sh [optional: max_per_task] -# Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. -# model_path: Path to the HuggingFace model to evaluate. -# max_per_task: (Optional) Maximum number of examples to evaluate per task, default setting to 16. - -export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" -mkdir -p $NANOCHAT_BASE_DIR - -EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip -if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then - curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL - unzip -q eval_bundle.zip - rm eval_bundle.zip - mv eval_bundle $NANOCHAT_BASE_DIR -fi - -if [ -z "$2" ]; then - MAX_PER_TASK=16 -else - MAX_PER_TASK=$2 -fi -uv run evaluate_model.py --hf_path=$1 --max-per-task=$MAX_PER_TASK - diff --git a/plato/benchmarks/language_models/report.py b/plato/benchmarks/language_models/report.py deleted file mode 100644 index 5721b1be2..000000000 --- a/plato/benchmarks/language_models/report.py +++ /dev/null @@ -1,446 +0,0 @@ -""" -Utilities for generating training report cards. More messy code than usual, will fix. 
-""" - -import os -import re -import shutil -import subprocess -import socket -import datetime -import platform -import psutil -import torch - - -def run_command(cmd): - """Run a shell command and return output, or None if it fails.""" - try: - result = subprocess.run( - cmd, shell=True, capture_output=True, text=True, timeout=5 - ) - if result.returncode == 0: - return result.stdout.strip() - return None - except: - return None - - -def get_git_info(): - """Get current git commit, branch, and dirty status.""" - info = {} - info["commit"] = run_command("git rev-parse --short HEAD") or "unknown" - info["branch"] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown" - - # Check if repo is dirty (has uncommitted changes) - status = run_command("git status --porcelain") - info["dirty"] = bool(status) if status is not None else False - - # Get commit message - info["message"] = run_command("git log -1 --pretty=%B") or "" - info["message"] = info["message"].split("\n")[0][:80] # First line, truncated - - return info - - -def get_gpu_info(): - """Get GPU information.""" - if not torch.cuda.is_available(): - return {"available": False} - - num_devices = torch.cuda.device_count() - info = {"available": True, "count": num_devices, "names": [], "memory_gb": []} - - for i in range(num_devices): - props = torch.cuda.get_device_properties(i) - info["names"].append(props.name) - info["memory_gb"].append(props.total_memory / (1024**3)) - - # Get CUDA version - info["cuda_version"] = torch.version.cuda or "unknown" - - return info - - -def get_system_info(): - """Get system information.""" - info = {} - - # Basic system info - info["hostname"] = socket.gethostname() - info["platform"] = platform.system() - info["python_version"] = platform.python_version() - info["torch_version"] = torch.__version__ - - # CPU and memory - info["cpu_count"] = psutil.cpu_count(logical=False) - info["cpu_count_logical"] = psutil.cpu_count(logical=True) - info["memory_gb"] = psutil.virtual_memory().total / (1024**3) - - # User and environment - info["user"] = os.environ.get("USER", "unknown") - info["nanochat_base_dir"] = os.environ.get("NANOCHAT_BASE_DIR", "out") - info["working_dir"] = os.getcwd() - - return info - - -def estimate_cost(gpu_info, runtime_hours=None): - """Estimate training cost based on GPU type and runtime.""" - - # Rough pricing, from Lambda Cloud - default_rate = 2.0 - gpu_hourly_rates = { - "H100": 3.00, - "A100": 1.79, - "V100": 0.55, - } - - if not gpu_info.get("available"): - return None - - # Try to identify GPU type from name - hourly_rate = None - gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown" - for gpu_type, rate in gpu_hourly_rates.items(): - if gpu_type in gpu_name: - hourly_rate = rate * gpu_info["count"] - break - - if hourly_rate is None: - hourly_rate = default_rate * gpu_info["count"] # Default estimate - - return { - "hourly_rate": hourly_rate, - "gpu_type": gpu_name, - "estimated_total": hourly_rate * runtime_hours if runtime_hours else None, - } - - -def generate_header(): - """Generate the header for a training report.""" - timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - git_info = get_git_info() - gpu_info = get_gpu_info() - sys_info = get_system_info() - cost_info = estimate_cost(gpu_info) - - header = f"""# nanochat training report - -Generated: {timestamp} - -## Environment - -### Git Information -- Branch: {git_info["branch"]} -- Commit: {git_info["commit"]} {"(dirty)" if git_info["dirty"] else "(clean)"} -- Message: 
{git_info["message"]} - -### Hardware -- Platform: {sys_info["platform"]} -- CPUs: {sys_info["cpu_count"]} cores ({sys_info["cpu_count_logical"]} logical) -- Memory: {sys_info["memory_gb"]:.1f} GB -""" - - if gpu_info.get("available"): - gpu_names = ", ".join(set(gpu_info["names"])) - total_vram = sum(gpu_info["memory_gb"]) - header += f"""- GPUs: {gpu_info["count"]}x {gpu_names} -- GPU Memory: {total_vram:.1f} GB total -- CUDA Version: {gpu_info["cuda_version"]} -""" - else: - header += "- GPUs: None available\n" - - if cost_info and cost_info["hourly_rate"] > 0: - header += f"""- Hourly Rate: ${cost_info["hourly_rate"]:.2f}/hour\n""" - - header += f""" -### Software -- Python: {sys_info["python_version"]} -- PyTorch: {sys_info["torch_version"]} - -""" - - # bloat metrics: package all of the source code and assess its weight - packaged = run_command( - 'files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml' - ) - num_chars = len(packaged) - num_lines = len(packaged.split("\n")) - num_files = len([x for x in packaged.split("\n") if x.startswith("")]) - num_tokens = num_chars // 4 # assume approximately 4 chars per token - - # count dependencies via uv.lock - uv_lock_lines = 0 - if os.path.exists("uv.lock"): - with open("uv.lock", "r") as f: - uv_lock_lines = len(f.readlines()) - - header += f""" -### Bloat -- Characters: {num_chars:,} -- Lines: {num_lines:,} -- Files: {num_files:,} -- Tokens (approx): {num_tokens:,} -- Dependencies (uv.lock lines): {uv_lock_lines:,} - -""" - return header - - -# ----------------------------------------------------------------------------- - - -def slugify(text): - """Slugify a text string.""" - return text.lower().replace(" ", "-") - - -# the expected files and their order -EXPECTED_FILES = [ - "tokenizer-training.md", - "tokenizer-evaluation.md", - "base-model-training.md", - "base-model-loss.md", - "base-model-evaluation.md", - "midtraining.md", - "chat-evaluation-mid.md", - "chat-sft.md", - "chat-evaluation-sft.md", - "chat-rl.md", - "chat-evaluation-rl.md", -] -# the metrics we're currently interested in -chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"] - - -def extract(section, keys): - """simple def to extract a single key from a section""" - if not isinstance(keys, list): - keys = [keys] # convenience - out = {} - for line in section.split("\n"): - for key in keys: - if key in line: - out[key] = line.split(":")[1].strip() - return out - - -def extract_timestamp(content, prefix): - """Extract timestamp from content with given prefix.""" - for line in content.split("\n"): - if line.startswith(prefix): - time_str = line.split(":", 1)[1].strip() - try: - return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") - except: - pass - return None - - -class Report: - """Maintains a bunch of logs, generates a final markdown report.""" - - def __init__(self, report_dir): - os.makedirs(report_dir, exist_ok=True) - self.report_dir = report_dir - - def log(self, section, data): - """Log a section of data to the report.""" - slug = slugify(section) - file_name = f"{slug}.md" - file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w") as f: - f.write(f"## {section}\n") - f.write( - f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" - ) - for item in data: - if not item: - # skip falsy values like None or empty dict etc. 
- continue - if isinstance(item, str): - # directly write the string - f.write(item) - else: - # render a dict - for k, v in item.items(): - if isinstance(v, float): - vstr = f"{v:.4f}" - elif isinstance(v, int) and v >= 10000: - vstr = f"{v:,.0f}" - else: - vstr = str(v) - f.write(f"- {k}: {vstr}\n") - f.write("\n") - return file_path - - def generate(self): - """Generate the final report.""" - report_dir = self.report_dir - report_file = os.path.join(report_dir, "report.md") - print(f"Generating report to {report_file}") - final_metrics = {} # the most important final metrics we'll add as table at the end - start_time = None - end_time = None - with open(report_file, "w") as out_file: - # write the header first - header_file = os.path.join(report_dir, "header.md") - if os.path.exists(header_file): - with open(header_file, "r") as f: - header_content = f.read() - out_file.write(header_content) - start_time = extract_timestamp(header_content, "Run started:") - # capture bloat data for summary later (the stuff after Bloat header and until \n\n) - bloat_data = re.search( - r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL - ) - bloat_data = bloat_data.group(1) if bloat_data else "" - else: - start_time = ( - None # will cause us to not write the total wall clock time - ) - bloat_data = "[bloat data missing]" - print( - f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?" - ) - # process all the individual sections - for file_name in EXPECTED_FILES: - section_file = os.path.join(report_dir, file_name) - if not os.path.exists(section_file): - print(f"Warning: {section_file} does not exist, skipping") - continue - with open(section_file, "r") as in_file: - section = in_file.read() - # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) - if "rl" not in file_name: - # Skip RL sections for end_time calculation because RL is experimental - end_time = extract_timestamp(section, "timestamp:") - # extract the most important metrics from the sections - if file_name == "base-model-evaluation.md": - final_metrics["base"] = extract(section, "CORE") - if file_name == "chat-evaluation-mid.md": - final_metrics["mid"] = extract(section, chat_metrics) - if file_name == "chat-evaluation-sft.md": - final_metrics["sft"] = extract(section, chat_metrics) - if file_name == "chat-evaluation-rl.md": - final_metrics["rl"] = extract( - section, "GSM8K" - ) # RL only evals GSM8K - # append this section of the report - out_file.write(section) - out_file.write("\n") - # add the final metrics table - out_file.write("## Summary\n\n") - # Copy over the bloat metrics from the header - out_file.write(bloat_data) - out_file.write("\n\n") - # Collect all unique metric names - all_metrics = set() - for stage_metrics in final_metrics.values(): - all_metrics.update(stage_metrics.keys()) - # Custom ordering: CORE first, ChatCORE last, rest in middle - all_metrics = sorted( - all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x) - ) - # Fixed column widths - stages = ["base", "mid", "sft", "rl"] - metric_width = 15 - value_width = 8 - # Write table header - header = f"| {'Metric'.ljust(metric_width)} |" - for stage in stages: - header += f" {stage.upper().ljust(value_width)} |" - out_file.write(header + "\n") - # Write separator - separator = f"|{'-' * (metric_width + 2)}|" - for stage in stages: - separator += f"{'-' * (value_width + 2)}|" - out_file.write(separator + "\n") - # Write table rows - for metric in all_metrics: - row = f"| 
{metric.ljust(metric_width)} |" - for stage in stages: - value = final_metrics.get(stage, {}).get(metric, "-") - row += f" {str(value).ljust(value_width)} |" - out_file.write(row + "\n") - out_file.write("\n") - # Calculate and write total wall clock time - if start_time and end_time: - duration = end_time - start_time - total_seconds = int(duration.total_seconds()) - hours = total_seconds // 3600 - minutes = (total_seconds % 3600) // 60 - out_file.write(f"Total wall clock time: {hours}h{minutes}m\n") - else: - out_file.write("Total wall clock time: unknown\n") - # also cp the report.md file to current directory - print(f"Copying report.md to current directory for convenience") - shutil.copy(report_file, "report.md") - return report_file - - def reset(self): - """Reset the report.""" - # Remove section files - for file_name in EXPECTED_FILES: - file_path = os.path.join(self.report_dir, file_name) - if os.path.exists(file_path): - os.remove(file_path) - # Remove report.md if it exists - report_file = os.path.join(self.report_dir, "report.md") - if os.path.exists(report_file): - os.remove(report_file) - # Generate and write the header section with start timestamp - header_file = os.path.join(self.report_dir, "header.md") - header = generate_header() - start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w") as f: - f.write(header) - f.write(f"Run started: {start_time}\n\n---\n\n") - print(f"Reset report and wrote header to {header_file}") - - -# ----------------------------------------------------------------------------- -# nanochat-specific convenience functions - - -class DummyReport: - def log(self, *args, **kwargs): - pass - - def reset(self, *args, **kwargs): - pass - - -def get_report(): - # just for convenience, only rank 0 logs to report - from common import get_base_dir, get_dist_info - - ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - if ddp_rank == 0: - report_dir = os.path.join(get_base_dir(), "report") - return Report(report_dir) - else: - return DummyReport() - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - description="Generate or reset nanochat training reports." - ) - parser.add_argument( - "command", - nargs="?", - default="generate", - choices=["generate", "reset"], - help="Operation to perform (default: generate)", - ) - args = parser.parse_args() - if args.command == "generate": - get_report().generate() - elif args.command == "reset": - get_report().reset() diff --git a/plato/benchmarks/language_models/tokenizer.py b/plato/benchmarks/language_models/tokenizer.py deleted file mode 100644 index fc962c1d0..000000000 --- a/plato/benchmarks/language_models/tokenizer.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -BPE Tokenizer in the style of GPT-4. - -Two implementations are available: -1) HuggingFace Tokenizer that can do both training and inference but is really confusing -2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. 
-""" - -import os - -SPECIAL_TOKENS = [ - # every document begins with the Beginning of Sequence (BOS) token that delimits documents - "<|bos|>", - # tokens below are only used during finetuning to render Conversations into token ids - "<|user_start|>", # user messages - "<|user_end|>", - "<|assistant_start|>", # assistant messages - "<|assistant_end|>", - "<|python_start|>", # assistant invokes python REPL tool - "<|python_end|>", - "<|output_start|>", # python REPL outputs back to assistant - "<|output_end|>", -] - -# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} -# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. -# I haven't validated that this is actually a good idea, TODO. -SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" - -# ----------------------------------------------------------------------------- -# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer -from tokenizers import Tokenizer as HFTokenizer -from tokenizers import pre_tokenizers, decoders, Regex -from tokenizers.models import BPE -from tokenizers.trainers import BpeTrainer - - -class HuggingFaceTokenizer: - """Light wrapper around HuggingFace Tokenizer for some utilities""" - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @classmethod - def from_pretrained(cls, hf_path): - # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") - tokenizer = HFTokenizer.from_pretrained(hf_path) - return cls(tokenizer) - - @classmethod - def from_directory(cls, tokenizer_dir): - # init from a local directory on disk (e.g. "out/tokenizer") - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - tokenizer = HFTokenizer.from_file(tokenizer_path) - return cls(tokenizer) - - @classmethod - def train_from_iterator(cls, text_iterator, vocab_size): - # train from an iterator of text - # Configure the HuggingFace Tokenizer - tokenizer = HFTokenizer( - BPE( - byte_fallback=True, # needed! - unk_token=None, - fuse_unk=False, - ) - ) - # Normalizer: None - tokenizer.normalizer = None - # Pre-tokenizer: GPT-4 style - # the regex pattern used by GPT-4 to split text into groups before BPE - # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to - # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. - # (but I haven't validated this! TODO) - gpt4_split_regex = Regex( - SPLIT_PATTERN - ) # huggingface demands that you wrap it in Regex!! 
- tokenizer.pre_tokenizer = pre_tokenizers.Sequence( - [ - pre_tokenizers.Split( - pattern=gpt4_split_regex, behavior="isolated", invert=False - ), - pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), - ] - ) - # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) - tokenizer.decoder = decoders.ByteLevel() - # Post-processor: None - tokenizer.post_processor = None - # Trainer: BPE - trainer = BpeTrainer( - vocab_size=vocab_size, - show_progress=True, - min_frequency=0, # no minimum frequency - initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), - special_tokens=SPECIAL_TOKENS, - ) - # Kick off the training - tokenizer.train_from_iterator(text_iterator, trainer) - return cls(tokenizer) - - def get_vocab_size(self): - return self.tokenizer.get_vocab_size() - - def get_special_tokens(self): - special_tokens_map = self.tokenizer.get_added_tokens_decoder() - special_tokens = [w.content for w in special_tokens_map.values()] - return special_tokens - - def id_to_token(self, id): - return self.tokenizer.id_to_token(id) - - def _encode_one(self, text, prepend=None, append=None): - # encode a single string - # prepend/append can be either a string of a special token or a token id directly. - assert isinstance(text, str) - ids = [] - if prepend is not None: - prepend_id = ( - prepend if isinstance(prepend, int) else self.encode_special(prepend) - ) - ids.append(prepend_id) - ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) - if append is not None: - append_id = ( - append if isinstance(append, int) else self.encode_special(append) - ) - ids.append(append_id) - return ids - - def encode_special(self, text): - # encode a single special token via exact match - return self.tokenizer.token_to_id(text) - - def get_bos_token_id(self): - bos = self.encode_special("<|bos|>") - return bos - - def encode(self, text, *args, **kwargs): - if isinstance(text, str): - return self._encode_one(text, *args, **kwargs) - elif isinstance(text, list): - return [self._encode_one(t, *args, **kwargs) for t in text] - else: - raise ValueError(f"Invalid input type: {type(text)}") - - def __call__(self, *args, **kwargs): - return self.encode(*args, **kwargs) - - def decode(self, ids): - return self.tokenizer.decode(ids, skip_special_tokens=False) - - def save(self, tokenizer_dir): - # save the tokenizer to disk - os.makedirs(tokenizer_dir, exist_ok=True) - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - self.tokenizer.save(tokenizer_path) - print(f"Saved tokenizer to {tokenizer_path}") - - -# ----------------------------------------------------------------------------- -# Universal Tokenizer Wrapper that works with any HuggingFace model -class UniversalHuggingFaceTokenizer: - """Universal wrapper that works with any HuggingFace model""" - - def __init__(self, tokenizer_dir, model_config=None): - self.tokenizer = HuggingFaceTokenizer.from_directory(tokenizer_dir) - self.model_config = model_config - self._pad_token_id = None - self._bos_token_id = None - self._detect_special_tokens() - - def _detect_special_tokens(self): - """Auto-detect special tokens for any model""" - # Try to get pad token from tokenizer - if hasattr(self.tokenizer, "tokenizer"): - tokenizer_obj = self.tokenizer.tokenizer - - # Try common pad token names - pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] - for candidate in pad_candidates: - if hasattr(tokenizer_obj, "token_to_id"): - token_id = tokenizer_obj.token_to_id(candidate) - if token_id is not None: - 
self._pad_token_id = token_id - break - - # Try common BOS token names - bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] - for candidate in bos_candidates: - if hasattr(tokenizer_obj, "token_to_id"): - token_id = tokenizer_obj.token_to_id(candidate) - if token_id is not None: - self._bos_token_id = token_id - break - - # Fallback to config-based detection - if self.model_config and hasattr(self.model_config, "pad_token_id"): - self._pad_token_id = self.model_config.pad_token_id - - if self.model_config and hasattr(self.model_config, "bos_token_id"): - self._bos_token_id = self.model_config.bos_token_id - - # Final fallbacks based on common patterns - if self._pad_token_id is None: - # Most models use either 0 or their EOS token - self._pad_token_id = 0 - - if self._bos_token_id is None: - # Use pad token as fallback - self._bos_token_id = self._pad_token_id - - def get_bos_token_id(self): - return self._bos_token_id - - def get_pad_token_id(self): - return self._pad_token_id - - def __call__(self, prompts, prepend=None): - """Universal tokenization method""" - if isinstance(prompts, str): - prompts = [prompts] - - result = [] - for prompt in prompts: - tokens = self.tokenizer.encode(prompt) - if prepend is not None: - tokens = [prepend] + tokens - result.append(tokens) - - return result[0] if len(result) == 1 else result - - def __getattr__(self, name): - return getattr(self.tokenizer, name) From a978bebb17748ca708d6291aba35bf663a9729f1 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 02:58:50 -0400 Subject: [PATCH 06/17] Added abstract eval_model() to TestingStrategy. --- plato/trainers/strategies/base.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/plato/trainers/strategies/base.py b/plato/trainers/strategies/base.py index a4c3159b6..a119ab20b 100644 --- a/plato/trainers/strategies/base.py +++ b/plato/trainers/strategies/base.py @@ -554,3 +554,36 @@ def test_model( setting eval mode, and computing the metric. """ pass + + @abstractmethod + def eval_model( + self, + model: nn.Module, + config: dict[str, Any], + benchmark, + sampler, + context: TrainingContext, + ) -> dict[str, Any]: + """ + Evaluate the model on benchmark and return results. + + Args: + model: The model to test + config: Testing configuration dictionary + benchmark: Benchmark instance for evaluation + sampler: Optional data sampler for test set + context: Training context with device, client_id, etc. + + Returns: + Benchmark results dictionary containing evaluation metrics. + For CORE benchmark, this includes: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + + Note: + This method should handle moving model to device, + setting eval mode, and computing the benchmark metrics. + The specific return format depends on the benchmark type. + """ + pass \ No newline at end of file From c9db3e02002155ca55b1635c2777aed955a66013 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:02:43 -0400 Subject: [PATCH 07/17] Added eval_model() to DefaultTestingStrategy. - Introduced eval_model() in testing.py to define a placeholder interface for benchmark-based evaluation. - The default strategy now raises NotImplementedError to prompt use of specialized testing strategies. 
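- For reference, a specialized strategy would override eval_model() along the
  following lines. This is only a sketch: the class name is hypothetical, the
  tokenizer lookup is an assumption, and only the benchmark wiring (model,
  tokenizer, device, then evaluate()) mirrors plato/benchmarks/core.py.

    class CoreBenchmarkTestingStrategy(DefaultTestingStrategy):
        """Hypothetical strategy that evaluates a model on a benchmark."""

        def eval_model(self, model, config, benchmark, sampler, context):
            # Hand the trainer's model, tokenizer, and device to the benchmark,
            # then delegate the evaluation loop to benchmark.evaluate().
            model.to(context.device)
            model.eval()
            benchmark.model = model
            benchmark.device = context.device
            benchmark.tokenizer = getattr(model, "tokenizer", None)  # assumed attribute
            return benchmark.evaluate()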
--- plato/trainers/strategies/testing.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/plato/trainers/strategies/testing.py b/plato/trainers/strategies/testing.py index 80e6c424f..dd76b9ff4 100644 --- a/plato/trainers/strategies/testing.py +++ b/plato/trainers/strategies/testing.py @@ -7,6 +7,7 @@ import logging import os +from typing import Any import torch @@ -97,3 +98,34 @@ def test_model(self, model, config, testset, sampler, context): ) return accuracy + + def eval_model( + self, + model, + config, + benchmark, + sampler, + context + ) -> dict[str, Any]: + """ + Evaluate the model on benchmark and return results. + + Args: + model: The model to test + config: Testing configuration dictionary + benchmark: Benchmark instance for evaluation + sampler: Optional data sampler for test set + context: Training context with device, client_id, etc. + + Returns: + Benchmark results dictionary + + Note: + DefaultTestingStrategy does not implement benchmark evaluation. + Use a specialized testing strategy (e.g., LLMSplitLearningTestingStrategy) + for benchmark support. + """ + raise NotImplementedError( + "DefaultTestingStrategy does not support benchmark evaluation. " + "Please implement a custom TestingStrategy with eval_model() for your use case." + ) \ No newline at end of file From 34ec5820b53590ba15c24b3bb07bf7e0c74d93f3 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:05:47 -0400 Subject: [PATCH 08/17] Added benchmark result save/load utilities. - Added static methods save_benchmark_result() and load_benchmark_result() in base.py for saving and loading benchmark evaluation results. --- plato/trainers/base.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/plato/trainers/base.py b/plato/trainers/base.py index 306ca0c4f..9b67a86b7 100644 --- a/plato/trainers/base.py +++ b/plato/trainers/base.py @@ -3,6 +3,7 @@ """ import os +import json from abc import ABC, abstractmethod from typing import Any, Optional @@ -77,6 +78,39 @@ def load_accuracy(filename=None): return accuracy + @staticmethod + def save_benchmark_result(benchmark_result, filename=None): + """Saving the benchmark result to a file.""" + model_path = Config().params["model_path"] + model_name = Config().trainer.model_name + + if not os.path.exists(model_path): + os.makedirs(model_path) + + if filename is not None: + benchmark_result_path = f"{model_path}/{filename}" + else: + benchmark_result_path = f"{model_path}/{model_name}.eval" + + with open(benchmark_result_path, "w", encoding="utf-8") as file: + json.dump(benchmark_result, file) + + @staticmethod + def load_benchmark_result(filename=None): + """Loading the benchmark result from a file.""" + model_path = Config().params["model_path"] + model_name = Config().trainer.model_name + + if filename is not None: + benchmark_result_path = f"{model_path}/{filename}" + else: + benchmark_result_path = f"{model_path}/{model_name}.eval" + + with open(benchmark_result_path, encoding="utf-8") as file: + benchmark_result = json.load(file) + + return benchmark_result + def pause_training(self): """Remove files of running trainers.""" if hasattr(Config().trainer, "max_concurrency"): From 956922f38bf8ed00b2f3a1d39e774a9b0441b05a Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:11:09 -0400 Subject: [PATCH 09/17] Implemented benchmark evaluation pipeline with multiprocessing. - Implemented benchmark evaluation pipeline in plato/trainers/composable.py. 
- Added eval_model(), eval(), and eval_process() methods. --- plato/trainers/composable.py | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/plato/trainers/composable.py b/plato/trainers/composable.py index a1da4a5eb..164c111bb 100644 --- a/plato/trainers/composable.py +++ b/plato/trainers/composable.py @@ -793,6 +793,85 @@ def test_model(self, config, testset, sampler=None, **kwargs): return accuracy + def eval_process(self, config, benchmark, sampler=None, **kwargs): + """The evaluating loop, run in a separate process.""" + self.eval_model(config, benchmark, sampler, **kwargs) + + model_name = Config().trainer.model_name + filename = f"{model_name}_{self.client_id}_{config['run_id']}.eval" + self.save_benchmark_result(self.benchmark_result, filename) + + def eval(self, benchmark, sampler=None, **kwargs) -> dict[str, Any]: + """ + Evaluate the model using the provided benchmark. + + Args: + benchmark: benchmark instance (from benchmarks.registry.get()) + sampler: The sampler for the test dataset + **kwargs: Additional keyword arguments + + Returns: + Accuracy on benchmark + """ + config = Config().trainer._asdict() + config["run_id"] = Config().params["run_id"] + + if "max_concurrency" in config: + model = self._require_model() + model.cpu() + + if mp.get_start_method(allow_none=True) != "spawn": + mp.set_start_method("spawn", force=True) + + eval_proc = mp.Process( + target=self.eval_process, + args=(config, benchmark, sampler), + kwargs=kwargs, + ) + eval_proc.start() + eval_proc.join() + + model_name = Config().trainer.model_name + filename = f"{model_name}_{self.client_id}_{Config().params['run_id']}.eval" + + try: + benchmark_result = self.load_benchmark_result(filename) + except OSError as error: + raise ValueError( + f"Evaluating on client {self.client_id} failed." + ) from error + + self.pause_training() + return benchmark_result + else: + return self.eval_model(config, benchmark, sampler, **kwargs) + + def eval_model(self, config, benchmark, sampler=None, **kwargs): + """ + Evaluate the model using benchmark. + + Args: + config: Evaluation configuration dictionary + benchmark: Benchmark instance (from benchmarks.registry.get()) + sampler: Optional data sampler (usually None for benchmarks) + **kwargs: Additional keyword arguments + + Returns: + Benchmark results dictionary containing: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + """ + + model = self._require_model() + result = self.testing_strategy.eval_model( + model, config, benchmark, sampler, self.context + ) + + self.benchmark_result = result + + return result + def obtain_model_update(self, config, trainset, sampler): """ Obtain model updates from training. From 103c6656df09ab9664d842156ca713e1d9bf5aab Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:11:58 -0400 Subject: [PATCH 10/17] Added registry for benchmark. --- plato/benchmarks/registry.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 plato/benchmarks/registry.py diff --git a/plato/benchmarks/registry.py b/plato/benchmarks/registry.py new file mode 100644 index 000000000..3cfb03252 --- /dev/null +++ b/plato/benchmarks/registry.py @@ -0,0 +1,30 @@ +""" +Registry for benchmarks. + +Enables runtime benchmark selection via configuration. 
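+
+A usage sketch (assuming the caller supplies model, tokenizer, and device,
+as plato/benchmarks/core.py expects before evaluate() is run):
+
+    from plato.benchmarks import registry
+
+    benchmark = registry.get("core")
+    benchmark.model = model
+    benchmark.tokenizer = tokenizer
+    benchmark.device = device
+    results = benchmark.evaluate()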
+""" +from plato.benchmarks import core +from plato.benchmarks.base import Benchmark as BenchmarkBase + +registered_benchmarks: dict[str, type[BenchmarkBase]] = { + "core": core.Benchmark, +} + +def get(type: str) -> BenchmarkBase: + """Get an instance of the benchmark.""" + if type in registered_benchmarks: + benchmark_cls = registered_benchmarks[type] + registered_benchmark = benchmark_cls() + else: + available = list(registered_benchmarks.keys()) + raise ValueError( + f"No such benchmark: {type}. " + f"Available benchmarks: {available}" + ) + + return registered_benchmark + + +def list_benchmarks(): + """List all available benchmark types.""" + return list(registered_benchmarks.keys()) From c3c5020034ebeca9c76624c2a27c001396092627 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:15:07 -0400 Subject: [PATCH 11/17] Added base class for evaluating trained models. --- plato/benchmarks/base.py | 135 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 plato/benchmarks/base.py diff --git a/plato/benchmarks/base.py b/plato/benchmarks/base.py new file mode 100644 index 000000000..a6e78516b --- /dev/null +++ b/plato/benchmarks/base.py @@ -0,0 +1,135 @@ +""" +Base class for benchmarks evaluating trained models. +""" + +from typing import Any +from abc import ABC, abstractmethod +import gzip +import logging +import os +import sys +import tarfile +import zipfile +from pathlib import Path +from urllib.parse import urlparse +import requests +import contextlib, time + +class Benchmark(ABC): + """Base class for model benchmarks.""" + + def __init__(self): + """ + Initialize the benchmark. + """ + super().__init__() + + @abstractmethod + def evaluate(self) -> dict[str, Any]: + """ + Evaluate the model on benchmark tasks. + + evaluate() returns evaluation results. + + Returns: + Dictionary of evaluation metrics + + Example: + >>> results = benchmark.evaluate() + >>> print(results) + {'task1_accuracy': 0.85, 'overall': 0.875} + """ + pass + + @abstractmethod + def get_formatted_result(self) -> str: + pass + + # Borrowed from plato/datasources/base.py + @staticmethod + @contextlib.contextmanager + def _download_guard(data_path: str): + """Serialise dataset downloads to avoid concurrent corruption.""" + os.makedirs(data_path, exist_ok=True) + lock_file = os.path.join(data_path, ".download.lock") + lock_fd = None + waited = False + + try: + while True: + try: + lock_fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_RDWR) + break + except FileExistsError: + if not waited: + logging.info( + "Another process is preparing the dataset at %s. 
Waiting.", + data_path, + ) + waited = True + time.sleep(1) + yield + finally: + if lock_fd is not None: + os.close(lock_fd) + try: + os.remove(lock_file) + except FileNotFoundError: + pass + + @staticmethod + def download(url, data_path): + """Download a dataset from a URL if it is not already available.""" + url_parse = urlparse(url) + file_name = os.path.join(data_path, url_parse.path.split("/")[-1]) + os.makedirs(data_path, exist_ok=True) + sentinel = Path(f"{file_name}.complete") + + if sentinel.exists(): + return + + with Benchmark._download_guard(data_path): + if sentinel.exists(): + return + + logging.info("Downloading %s.", url) + + res = requests.get(url, stream=True, timeout=60) + total_size = int(res.headers.get("Content-Length", 0)) + downloaded_size = 0 + + with open(file_name, "wb+") as file: + for chunk in res.iter_content(chunk_size=1024): + if not chunk: + continue + downloaded_size += len(chunk) + file.write(chunk) + file.flush() + if total_size: + sys.stdout.write(f"\r{100 * downloaded_size / total_size:.1f}%") + sys.stdout.flush() + if total_size: + sys.stdout.write("\n") + + # Unzip the compressed file just downloaded + logging.info("Decompressing the dataset downloaded.") + name, suffix = os.path.splitext(file_name) + + if file_name.endswith("tar.gz"): + with tarfile.open(file_name, "r:gz") as tar: + tar.extractall(data_path) + os.remove(file_name) + elif suffix == ".zip": + logging.info("Extracting %s to %s.", file_name, data_path) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(data_path) + elif suffix == ".gz": + with gzip.open(file_name, "rb") as zipped_file: + with open(name, "wb") as unzipped_file: + unzipped_file.write(zipped_file.read()) + os.remove(file_name) + else: + logging.info("Unknown compressed file type for %s.", file_name) + sys.exit() + + sentinel.touch() \ No newline at end of file From ed4b025de0e91902a7a43b87ffd32f775e706a11 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:16:10 -0400 Subject: [PATCH 12/17] Added CORE benchmark implementation for language models. --- plato/benchmarks/core.py | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 plato/benchmarks/core.py diff --git a/plato/benchmarks/core.py b/plato/benchmarks/core.py new file mode 100644 index 000000000..854048c2a --- /dev/null +++ b/plato/benchmarks/core.py @@ -0,0 +1,188 @@ +""" +CORE benchmark implementation for evaluating language models. +Borrowed and adapted from: https://github.com/karpathy/nanochat +""" + +import json +import logging +import os +import random +import time +from typing import Any + +import pandas as pd +import torch +import yaml + +from plato.benchmarks import base +from plato.benchmarks.core_helpers import core +from plato.config import Config + + +class Benchmark(base.Benchmark): + """ + CORE benchmark - evaluates language models on the CORE suite. + """ + + def __init__(self): + """ + Initialize CORE benchmark -- load benchmark tasks and data. + """ + super().__init__() + + # These will be set externally before evaluate() is called + self.model = None + self.device = None + self.tokenizer = None + + # Get configuration specific to CORE benchmark + self.random_seed = getattr(Config().benchmark, 'random_seed', 24) + self.max_per_task = getattr(Config().benchmark, 'max_per_task', -1) + + # Load benchmark tasks and datasets + self._load_benchmark_data() + + + def _load_benchmark_data(self): + """ + Load CORE benchmark tasks and evaluation data. 
+ + Downloads the evaluation bundle if not already present, then loads + task configurations and data files. + """ + # Get base directory and ensure eval_bundle is downloaded + benchmark_base_dir = Config.params["benchmark_path"] + + # Download eval_bundle if not present + if not os.path.exists(benchmark_base_dir): + logging.info("CORE evaluation bundle not found. Downloading...") + eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + Benchmark.download(eval_bundle_url, benchmark_base_dir) + + # Load benchmark configuration + eval_bundle_dir = os.path.join(benchmark_base_dir, "eval_bundle") + config_path = os.path.join(eval_bundle_dir, "core.yaml") + self.eval_meta_data_path = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + self.data_base_path = os.path.join(eval_bundle_dir, "eval_data") + + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + self.tasks = config["icl_tasks"] + self.eval_metadata = pd.read_csv(self.eval_meta_data_path) + + def evaluate(self) -> dict[str, Any]: + """ + Evaluate the model on all CORE tasks. + + Returns: + Dictionary containing: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + """ + + if self.model is None: + raise RuntimeError("Trainer has no model - cannot run benchmark") + + if self.tokenizer is None: + raise RuntimeError("Trainer has no tokenizer - cannot run benchmark") + + results = {} + centered_results = {} + + # Set model to eval mode + self.model.eval() + + with torch.no_grad(): + for task in self.tasks: + start_time = time.time() + label = task["label"] + + task_meta = { + "task_type": task["icl_task_type"], + "dataset_uri": task["dataset_uri"], + "num_fewshot": task["num_fewshot"][0], + "continuation_delimiter": task.get("continuation_delimiter", " "), + } + + logging.info( + "Evaluating task: %s (%d-shot, type: %s)", + label, + task_meta['num_fewshot'], + task_meta['task_type'] + ) + + # Load data for this task (matching evaluate_model.py pattern) + data_path = os.path.join(self.data_base_path, task_meta["dataset_uri"]) + with open(data_path, "r") as f: + data = [json.loads(line.strip()) for line in f] + + # Shuffle the data for reproducibility (matching evaluate_model.py) + shuffle_rng = random.Random(self.random_seed) + shuffle_rng.shuffle(data) + + # Crop data if max_per_task is specified + if self.max_per_task > 0: + data = data[:self.max_per_task] + + # Run evaluation using existing core_eval logic + accuracy = core.evaluate_task( + self.model, # Model in CUDA memory from trainer + self.tokenizer, # Tokenizer from trainer + data, + self.device, + task_meta + ) + + results[label] = accuracy + + # Compute centered result (normalized by random baseline) + row = self.eval_metadata[self.eval_metadata["Eval Task"] == label] + random_baseline = row["Random baseline"].values[0] + centered = (accuracy - 0.01 * random_baseline) / ( + 1.0 - 0.01 * random_baseline + ) + centered_results[label] = centered + + elapsed = time.time() - start_time + logging.info( + "accuracy: %.4f | centered: %.4f | time: %.2fs", + accuracy, + centered, + elapsed + ) + + # Compute overall CORE metric + core_metric = sum(centered_results.values()) / len(centered_results) + + + return { + "results": results, + "centered_results": centered_results, + "core_metric": core_metric, + } + + def get_formatted_result(self, evaluation_result: dict[str, Any]) -> str: + """ + Format the evaluation results for display. 
+ + Args: + evaluation_result: The dictionary returned by the evaluate() method. + Returns: + A formatted string summarizing the results. + """ + results = evaluation_result["results"] + centered_results = evaluation_result["centered_results"] + core_metric = evaluation_result["core_metric"] + + result_lines = [f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}"] + for task, acc in results.items(): + centered = centered_results[task] + result_lines.append( + f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}" + ) + result_lines.append(f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n") + + return "\n".join(result_lines) + \ No newline at end of file From d8cf2147ff13607776864ca2a78f632ee8484c1a Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:18:51 -0400 Subject: [PATCH 13/17] Added helper functions for CORE benchmark implementation. --- plato/benchmarks/__init__.py | 0 plato/benchmarks/core_helpers/core.py | 281 ++++++++++++++++++++ plato/benchmarks/core_helpers/tokenizer.py | 290 +++++++++++++++++++++ 3 files changed, 571 insertions(+) create mode 100644 plato/benchmarks/__init__.py create mode 100644 plato/benchmarks/core_helpers/core.py create mode 100644 plato/benchmarks/core_helpers/tokenizer.py diff --git a/plato/benchmarks/__init__.py b/plato/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plato/benchmarks/core_helpers/core.py b/plato/benchmarks/core_helpers/core.py new file mode 100644 index 000000000..2639c9c48 --- /dev/null +++ b/plato/benchmarks/core_helpers/core.py @@ -0,0 +1,281 @@ +""" +Borrowed and adapted from: https://github.com/karpathy/nanochat + +Functions for evaluating the CORE metric, as described in the DCLM paper. +https://arxiv.org/abs/2406.11794 + +TODOs: +- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
+""" + +import random + +from jinja2 import Template +import torch + +from plato.benchmarks.core_helpers.tokenizer import UniversalHuggingFaceTokenizer + + +def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a multiple choice question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} + +{% endfor -%} +{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [template.render(choice=choice, **context) for choice in item["choices"]] + return prompts + + +def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a schema question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [ + template.render(context=context_option, **context) + for context_option in item["context_options"] + ] + return prompts + + +def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): + """ + Render complete prompt for a language modeling task. + Notice that we manually trim the context in the template, + which in some datasets seems to have trailing whitespace (which we don't want). + """ + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + # Return two prompts: without and with the continuation + prompt_without = template.render(include_continuation=False, **context) + prompt_with = template.render(include_continuation=True, **context) + # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. + # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next + # token in prompt_with), meaning we don't get a nice and clean prefix in the token space + # to detect the final continuation. Tokenizers... 
+ prompt_without = prompt_without.strip() + return [prompt_without, prompt_with] + + +def find_common_length(token_sequences, direction="left"): + """ + Find the length of the common prefix or suffix across token sequences + - direction: 'left' for prefix, 'right' for suffix + """ + min_len = min(len(seq) for seq in token_sequences) + indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] + # Find the first position where the token sequences differ + for i, idx in enumerate(indices): + token = token_sequences[0][idx] + if not all(seq[idx] == token for seq in token_sequences): + return i + return min_len + + +def stack_sequences(tokens, pad_token_id): + """Stack up a list of token sequences, pad to longest on the right""" + bsz, seq_len = len(tokens), max(len(x) for x in tokens) + input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) + for i, x in enumerate(tokens): + input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) + return input_ids + + +def batch_sequences_mc(tokenizer, prompts): + # In multiple choice, contexts are the same but the continuation is different (common prefix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each continuation + answer_start_idx = find_common_length(tokens, direction="left") + start_indices = [answer_start_idx] * len(prompts) + end_indices = [len(x) for x in tokens] + return tokens, start_indices, end_indices + + +def batch_sequences_schema(tokenizer, prompts): + # In schema tasks, contexts vary but continuation is the same (common suffix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each context + suffix_length = find_common_length(tokens, direction="right") + end_indices = [len(x) for x in tokens] + start_indices = [ei - suffix_length for ei in end_indices] + return tokens, start_indices, end_indices + + +def batch_sequences_lm(tokenizer, prompts): + # In LM tasks, we have two prompts: without and with continuation + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + tokens_without, tokens_with = tokens + start_idx, end_idx = len(tokens_without), len(tokens_with) + assert start_idx < end_idx, ( + "prompt without is supposed to be a prefix of prompt with" + ) + assert tokens_without == tokens_with[:start_idx], ( + "prompt without is supposed to be a prefix of prompt with" + ) + # we only need the with continuation prompt in the LM task, i.e. batch size of 1 + return [tokens_with], [start_idx], [end_idx] + + +@torch.no_grad() +def forward_model(model, input_ids): + """ + Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. + The last column of losses is set to nan because we don't have autoregressive targets there. 
+ """ + batch_size, seq_len = input_ids.size() + outputs = model(input_ids) + + # Extract logits from model output (handles both raw tensors and HuggingFace output objects) + if hasattr(outputs, 'logits'): + logits = outputs.logits + else: + logits = outputs + + # Roll the tensor to the left by one position to get the (autoregressive) target ids + target_ids = torch.roll(input_ids, shifts=-1, dims=1) + # Calculate cross entropy at all positions + losses = torch.nn.functional.cross_entropy( + logits.view(batch_size * seq_len, -1), + target_ids.view(batch_size * seq_len), + reduction="none", + ).view(batch_size, seq_len) + # Set the last column to be nan because there is no autoregressive loss there + losses[:, -1] = float("nan") + # Get the argmax predictions at each position + predictions = logits.argmax(dim=-1) + return losses, predictions + + +@torch.no_grad() +def evaluate_example(idx, model, tokenizer, data, device, task_meta): + """Evaluate a single example, return True if correct, False otherwise""" + item = data[idx] + task_type = task_meta["task_type"] + num_fewshot = task_meta["num_fewshot"] + continuation_delimiter = task_meta["continuation_delimiter"] + + # Sample few-shot examples (excluding current item) + fewshot_examples = [] + if num_fewshot > 0: + rng = random.Random(1234 + idx) + available_indices = [i for i in range(len(data)) if i != idx] + fewshot_indices = rng.sample(available_indices, num_fewshot) + fewshot_examples = [data[i] for i in fewshot_indices] + + # Render prompts and batch sequences based on task type + if task_type == "multiple_choice": + prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) + elif task_type == "schema": + prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) + elif task_type == "language_modeling": + prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) + else: + raise ValueError(f"Unsupported task type: {task_type}") + + # Some models can't forward sequences beyond a certain length (e.g. GPT-2) + # In these cases, we have to truncate sequences to max length and adjust the indices + max_tokens = None + if hasattr(model, "max_seq_len") and model.max_seq_len is not None: + max_tokens = model.max_seq_len + elif hasattr(model, "config"): + # For HuggingFace models, check common config attributes + if hasattr(model.config, "n_positions"): + max_tokens = model.config.n_positions + elif hasattr(model.config, "max_position_embeddings"): + max_tokens = model.config.max_position_embeddings + else: + max_tokens = 1024 # default to 1024 (GPT-2) if no info available + + if max_tokens is not None: + new_tokens, new_start_idxs, new_end_idxs = [], [], [] + for t, s, e in zip(tokens, start_idxs, end_idxs): + if len(t) > max_tokens: + num_to_crop = len(t) - max_tokens + new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens + new_start_idxs.append(s - num_to_crop) # shift the indices down + new_end_idxs.append(e - num_to_crop) + assert s - num_to_crop >= 0, "this should never happen right?" + assert e - num_to_crop >= 0, "this should never happen right?" 
+ else: + new_tokens.append(t) # keep unchanged + new_start_idxs.append(s) + new_end_idxs.append(e) + tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs + + # Stack up all the sequences into a batch + pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok + input_ids = stack_sequences(tokens, pad_token_id) + input_ids = input_ids.to(device) + + # Forward the model, get the autoregressive loss and argmax prediction at each token + losses, predictions = forward_model(model, input_ids) + + # See if the losses/predictions come out correctly + if task_type == "language_modeling": + # language modeling task is currently always batch size 1 + si = start_idxs[0] + ei = end_idxs[0] + # predictions[i] predict input_ids[i+1] autoregressively + predicted_tokens = predictions[0, si - 1 : ei - 1] + actual_tokens = input_ids[0, si:ei] + is_correct = torch.all(predicted_tokens == actual_tokens).item() + elif task_type in ["multiple_choice", "schema"]: + # For MC/schema: find the option with lowest average loss + mean_losses = [ + losses[i, si - 1 : ei - 1].mean().item() + for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) + ] + pred_idx = mean_losses.index(min(mean_losses)) + is_correct = pred_idx == item["gold"] + else: + raise ValueError(f"Unsupported task type: {task_type}") + + return is_correct + + +def evaluate_task(model, tokenizer, data, device, task_meta): + """ + This function is responsible for evaluating one task across many examples. + """ + # wrap tokenizer with Universal wrapper for compatibility + tokenizer = UniversalHuggingFaceTokenizer(tokenizer) + correct = torch.zeros(len(data), dtype=torch.float32, device=device) + for idx in range(len(data)): + is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) + correct[idx] = float(is_correct) + # compute the mean + mean_correct = correct.mean().item() + return mean_correct diff --git a/plato/benchmarks/core_helpers/tokenizer.py b/plato/benchmarks/core_helpers/tokenizer.py new file mode 100644 index 000000000..7b6f0e4d5 --- /dev/null +++ b/plato/benchmarks/core_helpers/tokenizer.py @@ -0,0 +1,290 @@ +""" +BPE Tokenizer in the style of GPT-4. + +Two implementations are available: +1) HuggingFace Tokenizer that can do both training and inference but is really confusing +2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. +""" + +import os + +SPECIAL_TOKENS = [ + # every document begins with the Beginning of Sequence (BOS) token that delimits documents + "<|bos|>", + # tokens below are only used during finetuning to render Conversations into token ids + "<|user_start|>", # user messages + "<|user_end|>", + "<|assistant_start|>", # assistant messages + "<|assistant_end|>", + "<|python_start|>", # assistant invokes python REPL tool + "<|python_end|>", + "<|output_start|>", # python REPL outputs back to assistant + "<|output_end|>", +] + +# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} +# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. +# I haven't validated that this is actually a good idea, TODO. 
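+# (Illustrative example, not from the original: under \p{N}{1,2} the digit run "2025"
+# pre-splits into "20" + "25" before BPE, whereas the GPT-4 pattern \p{N}{1,3} would
+# give "202" + "5".)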
+SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" + +# ----------------------------------------------------------------------------- +# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer +from tokenizers import Tokenizer as HFTokenizer +from tokenizers import pre_tokenizers, decoders, Regex +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + + +class HuggingFaceTokenizer: + """Light wrapper around HuggingFace Tokenizer for some utilities""" + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, hf_path): + # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") + tokenizer = HFTokenizer.from_pretrained(hf_path) + return cls(tokenizer) + + @classmethod + def from_directory(cls, tokenizer_dir): + # init from a local directory on disk (e.g. "out/tokenizer") + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + tokenizer = HFTokenizer.from_file(tokenizer_path) + return cls(tokenizer) + + @classmethod + def train_from_iterator(cls, text_iterator, vocab_size): + # train from an iterator of text + # Configure the HuggingFace Tokenizer + tokenizer = HFTokenizer( + BPE( + byte_fallback=True, # needed! + unk_token=None, + fuse_unk=False, + ) + ) + # Normalizer: None + tokenizer.normalizer = None + # Pre-tokenizer: GPT-4 style + # the regex pattern used by GPT-4 to split text into groups before BPE + # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to + # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. + # (but I haven't validated this! TODO) + gpt4_split_regex = Regex( + SPLIT_PATTERN + ) # huggingface demands that you wrap it in Regex!! + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + pattern=gpt4_split_regex, behavior="isolated", invert=False + ), + pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), + ] + ) + # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) + tokenizer.decoder = decoders.ByteLevel() + # Post-processor: None + tokenizer.post_processor = None + # Trainer: BPE + trainer = BpeTrainer( + vocab_size=vocab_size, + show_progress=True, + min_frequency=0, # no minimum frequency + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + special_tokens=SPECIAL_TOKENS, + ) + # Kick off the training + tokenizer.train_from_iterator(text_iterator, trainer) + return cls(tokenizer) + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + + def get_special_tokens(self): + special_tokens_map = self.tokenizer.get_added_tokens_decoder() + special_tokens = [w.content for w in special_tokens_map.values()] + return special_tokens + + def id_to_token(self, id): + return self.tokenizer.id_to_token(id) + + def _encode_one(self, text, prepend=None, append=None): + # encode a single string + # prepend/append can be either a string of a special token or a token id directly. 
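+        # (Hypothetical example: _encode_one("hello", prepend="<|bos|>") returns
+        # [bos_id, *ids_of("hello")]; passing prepend=bos_id directly behaves the same.)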
+ assert isinstance(text, str) + ids = [] + if prepend is not None: + prepend_id = ( + prepend if isinstance(prepend, int) else self.encode_special(prepend) + ) + ids.append(prepend_id) + ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) + if append is not None: + append_id = ( + append if isinstance(append, int) else self.encode_special(append) + ) + ids.append(append_id) + return ids + + def encode_special(self, text): + # encode a single special token via exact match + return self.tokenizer.token_to_id(text) + + def get_bos_token_id(self): + bos = self.encode_special("<|bos|>") + return bos + + def encode(self, text, *args, **kwargs): + if isinstance(text, str): + return self._encode_one(text, *args, **kwargs) + elif isinstance(text, list): + return [self._encode_one(t, *args, **kwargs) for t in text] + else: + raise ValueError(f"Invalid input type: {type(text)}") + + def __call__(self, *args, **kwargs): + return self.encode(*args, **kwargs) + + def decode(self, ids): + return self.tokenizer.decode(ids, skip_special_tokens=False) + + def save(self, tokenizer_dir): + # save the tokenizer to disk + os.makedirs(tokenizer_dir, exist_ok=True) + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + self.tokenizer.save(tokenizer_path) + print(f"Saved tokenizer to {tokenizer_path}") + + +# ----------------------------------------------------------------------------- +# Universal Tokenizer Wrapper that works with any HuggingFace model +# For example, GPT2TokenizerFast doesn't have a get_bos_token_id() method, +# so we need this wrapper to provide a unified interface. + + +class UniversalHuggingFaceTokenizer: + """ + Universal wrapper that provides a consistent interface for any HuggingFace tokenizer. + + This wrapper automatically detects special tokens (BOS, PAD, EOS) and provides + utility methods that work across different tokenizer implementations. + """ + + def __init__(self, tokenizer): + """ + Initialize the wrapper with a HuggingFace tokenizer. + + Args: + tokenizer: A HuggingFace tokenizer instance (e.g., GPT2TokenizerFast) + """ + self.tokenizer = tokenizer + self._pad_token_id = None + self._bos_token_id = None + self._eos_token_id = None + self._detect_special_tokens() + + def _detect_special_tokens(self): + """ + Auto-detect special token IDs from the tokenizer. + + Detection strategy (in order of priority): + 1. Try direct attributes on the tokenizer (bos_token_id, pad_token_id, eos_token_id) + 2. For missing tokens, use EOS as BOS/PAD for models like GPT-2 + 3. Try token_to_id() method with common token names + 4. 
Final fallbacks: 0 for pad, pad for bos + """ + # Strategy 1: Direct attributes (works for most HuggingFace tokenizers) + if hasattr(self.tokenizer, 'bos_token_id') and self.tokenizer.bos_token_id is not None: + self._bos_token_id = self.tokenizer.bos_token_id + + if hasattr(self.tokenizer, 'pad_token_id') and self.tokenizer.pad_token_id is not None: + self._pad_token_id = self.tokenizer.pad_token_id + + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + self._eos_token_id = self.tokenizer.eos_token_id + # For GPT-2 and similar models, BOS is often the same as EOS + if self._bos_token_id is None: + self._bos_token_id = self._eos_token_id + # Use EOS as pad if no pad token exists + if self._pad_token_id is None: + self._pad_token_id = self._eos_token_id + + # Strategy 2: Try token_to_id method for tokenizers with nested structure + if hasattr(self.tokenizer, "tokenizer"): + tokenizer_obj = self.tokenizer.tokenizer + + if self._pad_token_id is None: + pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] + self._pad_token_id = self._try_token_candidates(tokenizer_obj, pad_candidates) + + if self._bos_token_id is None: + bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] + self._bos_token_id = self._try_token_candidates(tokenizer_obj, bos_candidates) + + # Strategy 3: Final fallbacks + if self._pad_token_id is None: + self._pad_token_id = 0 # Most models default to 0 + + if self._bos_token_id is None: + self._bos_token_id = self._pad_token_id + + def _try_token_candidates(self, tokenizer_obj, candidates): + """ + Try to find a token ID from a list of candidate token strings. + + Args: + tokenizer_obj: The tokenizer object with token_to_id method + candidates: List of token strings to try + + Returns: + Token ID if found, None otherwise + """ + if not hasattr(tokenizer_obj, "token_to_id"): + return None + + for candidate in candidates: + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + return token_id + return None + + def get_bos_token_id(self): + """Get the beginning-of-sequence token ID.""" + return self._bos_token_id + + def get_pad_token_id(self): + """Get the padding token ID.""" + return self._pad_token_id + + def get_eos_token_id(self): + """Get the end-of-sequence token ID.""" + return self._eos_token_id + + def __call__(self, prompts, prepend=None): + """ + Tokenize prompts with optional prepended token. + + Args: + prompts: Single string or list of strings to tokenize + prepend: Optional token ID to prepend to each sequence + + Returns: + List of token IDs, or list of lists if multiple prompts + """ + if isinstance(prompts, str): + prompts = [prompts] + + result = [] + for prompt in prompts: + tokens = self.tokenizer.encode(prompt) + if prepend is not None: + tokens = [prepend] + tokens + result.append(tokens) + + return result[0] if len(result) == 1 else result + + def __getattr__(self, name): + """Delegate all other attributes to the wrapped tokenizer.""" + return getattr(self.tokenizer, name) From aaac39bbf61eab5234068df88be9e1e4d4e0543c Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:21:26 -0400 Subject: [PATCH 14/17] Added benchmark evaluation support in fedavg.py. 
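
With a [benchmark] section that carries a "type" field in the configuration, the server now
resolves the benchmark through the registry and evaluates the aggregated global model right
after the usual accuracy test in _process_reports().

A minimal standalone sketch of the same flow (a sketch only, under these assumptions: a
Plato configuration with a [benchmark] section of type "core" has already been loaded,
since the CORE benchmark reads Config() in its constructor, and a HuggingFace GPT-2 model
and tokenizer stand in for the trainer-managed objects):

    import torch
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    from plato.benchmarks import registry as benchmarks_registry

    # Resolve the benchmark the same way fedavg.py does.
    benchmark = benchmarks_registry.get("core")

    # The CORE benchmark expects model, tokenizer, and device to be assigned
    # externally before evaluate() is called.
    benchmark.device = "cuda" if torch.cuda.is_available() else "cpu"
    benchmark.model = GPT2LMHeadModel.from_pretrained("gpt2").to(benchmark.device)
    benchmark.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    result = benchmark.evaluate()
    print(benchmark.get_formatted_result(result))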
--- plato/servers/fedavg.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/plato/servers/fedavg.py b/plato/servers/fedavg.py index ee636cb49..b95fe76b8 100644 --- a/plato/servers/fedavg.py +++ b/plato/servers/fedavg.py @@ -7,6 +7,7 @@ import os from plato.algorithms import registry as algorithms_registry +from plato.benchmarks import registry as benchmarks_registry from plato.config import Config from plato.datasources import registry as datasources_registry from plato.processors import registry as processor_registry @@ -50,6 +51,8 @@ def __init__( self.testset_sampler = None self.total_samples = 0 + self.benchmark = None + self.total_clients = Config().clients.total_clients self.clients_per_round = Config().clients.per_round @@ -252,6 +255,17 @@ async def _process_reports(self): trainer = self.require_trainer() self.accuracy = trainer.test(self.testset, self.testset_sampler) + # Evaluating the global model on the specified benchmark + if hasattr(Config().config, "benchmark") and hasattr(Config().benchmark, "type"): + benchmark_type = Config().benchmark.type + if self.benchmark is None: + self.benchmark = benchmarks_registry.get(benchmark_type) + logging.info("[%s] Started model evaluation on benchmark %s.", self, benchmark_type) + trainer = self.require_trainer() + self.benchmark_result = trainer.eval(self.benchmark, self.testset_sampler) + logging.info("[%s] Model evaluation result on benchmark %s:\n%s.", self, benchmark_type, self.benchmark.get_formatted_result(self.benchmark_result)) + + if hasattr(Config().trainer, "target_perplexity"): logging.info( fonts.colourize( From 0a36f74a0d3f72309b9f012060a17e1db25c85bb Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:21:59 -0400 Subject: [PATCH 15/17] Added benchmark configuration support in config.py. --- plato/config.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/plato/config.py b/plato/config.py index ec7d840a0..a3c73344e 100644 --- a/plato/config.py +++ b/plato/config.py @@ -153,6 +153,7 @@ class Config: clients: Any server: Any data: Any + benchmark: Any trainer: Any algorithm: Any results: Any @@ -342,6 +343,20 @@ def __new__(cls): Config.params["base_path"], "data" ) + # User specific benchmark + if hasattr(config, "benchmark"): + Config.benchmark = config.benchmark + + # Directory of benchmark dataset + if hasattr(Config().benchmark, "data_path"): + Config.params["benchmark_path"] = os.path.join( + Config.params["base_path"], Config().benchmark.data_path + ) + else: + Config.params["benchmark_path"] = os.path.join( + Config.params["base_path"], "benchmark" + ) + # Pretrained models if hasattr(Config().server, "model_path"): Config.params["model_path"] = os.path.join( @@ -401,6 +416,10 @@ def __new__(cls): if hasattr(config, "parameters"): Config.parameters = config.parameters + + # Benchmark configuration (for model evaluation) + if hasattr(config, "benchmark"): + Config.benchmark = config.benchmark return cls._instance From e60d99bc73572896cd1975cc8b979a544066fcae Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:26:31 -0400 Subject: [PATCH 16/17] Added support for split learning benchmark evaluation. - Enabled benchmark evaluation in split learning to test and validate benchmark implementations. 
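
For reference when reading the logged benchmark output: the CORE benchmark reports, for
each task, both the raw accuracy and a score centered against that task's random-guessing
baseline,

    centered = (accuracy - 0.01 * random_baseline) / (1 - 0.01 * random_baseline)

so a hypothetical 4-way multiple-choice task with a 25 percent baseline and an accuracy of
0.40 would be centered to (0.40 - 0.25) / 0.75 = 0.20. The overall CORE metric printed at
the end of the report is the unweighted mean of the centered scores across tasks.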
--- .../split_learning_trainer.py | 50 +++++++++++++++++++ .../split_learning_wikitext2_gpt2.toml | 7 ++- plato/trainers/split_learning.py | 2 + 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/examples/split_learning/llm_split_learning/split_learning_trainer.py b/examples/split_learning/llm_split_learning/split_learning_trainer.py index b921eba66..87b97a56d 100644 --- a/examples/split_learning/llm_split_learning/split_learning_trainer.py +++ b/examples/split_learning/llm_split_learning/split_learning_trainer.py @@ -161,7 +161,57 @@ def test_model(self, model, config, testset, sampler, context): # Save other metric information such as accuracy tester.log_metrics("eval", metrics) return metrics["eval_accuracy"] + + def eval_model( + self, + model, + config, + benchmark, + sampler, + context + ): + """ + Evaluate the model using the benchmark specified in the configuration. + + This is a specialized implementation for HuggingFace-based models. + + Arguments: + model: The model to evaluate + config: Testing configuration dictionary + benchmark: Benchmark instance (e.g., from plato.benchmarks.registry.get()) + sampler: Optional data sampler (not used for CORE benchmark) + context: Training context + Returns: + Benchmark results dictionary containing: + - 'results': per-task accuracies (for CORE) + - 'centered_results': normalized scores (for CORE) + - 'core_metric': overall benchmark score (for CORE) + """ + + if hasattr(model, "copy_weight"): + model.copy_weight() + + # Get base model if available + base_model = model.base_model if hasattr(model, "base_model") else model + + # Set model to eval mode and move to device + base_model.to(context.device) + base_model.eval() + + + if hasattr(benchmark, 'model'): + benchmark.model = base_model + if hasattr(benchmark, 'device'): + benchmark.device = context.device + if hasattr(benchmark, 'tokenizer') and self.tokenizer is not None: + benchmark.tokenizer = self.tokenizer + + # Use benchmark's evaluate method to get results + # benchmark.evaluate() returns dict with metrics + results = benchmark.evaluate() + + return results # ============================================================================ # Custom Callbacks for LLM Split Learning diff --git a/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml b/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml index cd068c09c..88c67e48b 100644 --- a/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml +++ b/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml @@ -40,13 +40,18 @@ random_seed = 1 # IID, biased, or sharded? 
sampler = "iid" +[benchmark] +type = "core" # Benchmark type (from registry) +max_per_task = 16 # Limit samples per task for faster evaluation +random_seed = 1 + [trainer] # The type of the trainer type = "split_learning" # The maximum number of training rounds -rounds = 100000 +rounds = 10 # The machine learning model model_type = "huggingface" diff --git a/plato/trainers/split_learning.py b/plato/trainers/split_learning.py index 8a8b81536..ac09322ee 100644 --- a/plato/trainers/split_learning.py +++ b/plato/trainers/split_learning.py @@ -214,6 +214,8 @@ def test_model(self, model, config, testset, sampler, context): accuracy = correct / total return accuracy + def eval_model(self, model, config, benchmark, sampler, context): + raise NotImplementedError("eval_model is not implemented yet for SplitLearningTestingStrategy.") # pylint:disable=too-many-instance-attributes class Trainer(ComposableTrainer): From ec1c1ba3403550fae5a1fcd0e44d73f10880576d Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:32:50 -0400 Subject: [PATCH 17/17] Reformatted code using Ruff. --- .../split_learning_trainer.py | 21 ++--- .../fedunlearning/fedunlearning_server.py | 4 +- plato/benchmarks/base.py | 13 +-- plato/benchmarks/core.py | 79 +++++++++---------- plato/benchmarks/core_helpers/core.py | 6 +- plato/benchmarks/core_helpers/tokenizer.py | 47 +++++++---- plato/benchmarks/registry.py | 7 +- plato/config.py | 4 +- plato/servers/fedavg.py | 16 +++- plato/trainers/split_learning.py | 5 +- plato/trainers/strategies/base.py | 2 +- plato/trainers/strategies/testing.py | 11 +-- 12 files changed, 112 insertions(+), 103 deletions(-) diff --git a/examples/split_learning/llm_split_learning/split_learning_trainer.py b/examples/split_learning/llm_split_learning/split_learning_trainer.py index 87b97a56d..348d6a34b 100644 --- a/examples/split_learning/llm_split_learning/split_learning_trainer.py +++ b/examples/split_learning/llm_split_learning/split_learning_trainer.py @@ -161,15 +161,8 @@ def test_model(self, model, config, testset, sampler, context): # Save other metric information such as accuracy tester.log_metrics("eval", metrics) return metrics["eval_accuracy"] - - def eval_model( - self, - model, - config, - benchmark, - sampler, - context - ): + + def eval_model(self, model, config, benchmark, sampler, context): """ Evaluate the model using the benchmark specified in the configuration. 
@@ -194,17 +187,16 @@ def eval_model( # Get base model if available base_model = model.base_model if hasattr(model, "base_model") else model - + # Set model to eval mode and move to device base_model.to(context.device) base_model.eval() - - if hasattr(benchmark, 'model'): + if hasattr(benchmark, "model"): benchmark.model = base_model - if hasattr(benchmark, 'device'): + if hasattr(benchmark, "device"): benchmark.device = context.device - if hasattr(benchmark, 'tokenizer') and self.tokenizer is not None: + if hasattr(benchmark, "tokenizer") and self.tokenizer is not None: benchmark.tokenizer = self.tokenizer # Use benchmark's evaluate method to get results @@ -213,6 +205,7 @@ def eval_model( return results + # ============================================================================ # Custom Callbacks for LLM Split Learning # ============================================================================ diff --git a/examples/unlearning/fedunlearning/fedunlearning_server.py b/examples/unlearning/fedunlearning/fedunlearning_server.py index 8938b6549..6d6ade576 100644 --- a/examples/unlearning/fedunlearning/fedunlearning_server.py +++ b/examples/unlearning/fedunlearning/fedunlearning_server.py @@ -43,9 +43,7 @@ async def aggregate_deltas(self, updates, deltas_received, context): if not filtered_pairs: if self._fallback_to_original: - return await super().aggregate_deltas( - updates, deltas_received, context - ) + return await super().aggregate_deltas(updates, deltas_received, context) zero_delta = self._zero_delta( context, deltas_received[0] if deltas_received else None diff --git a/plato/benchmarks/base.py b/plato/benchmarks/base.py index a6e78516b..3978723ce 100644 --- a/plato/benchmarks/base.py +++ b/plato/benchmarks/base.py @@ -15,6 +15,7 @@ import requests import contextlib, time + class Benchmark(ABC): """Base class for model benchmarks.""" @@ -23,24 +24,24 @@ def __init__(self): Initialize the benchmark. """ super().__init__() - + @abstractmethod def evaluate(self) -> dict[str, Any]: """ Evaluate the model on benchmark tasks. evaluate() returns evaluation results. - + Returns: Dictionary of evaluation metrics - + Example: >>> results = benchmark.evaluate() - >>> print(results) + >>> print(results) {'task1_accuracy': 0.85, 'overall': 0.875} """ pass - + @abstractmethod def get_formatted_result(self) -> str: pass @@ -132,4 +133,4 @@ def download(url, data_path): logging.info("Unknown compressed file type for %s.", file_name) sys.exit() - sentinel.touch() \ No newline at end of file + sentinel.touch() diff --git a/plato/benchmarks/core.py b/plato/benchmarks/core.py index 854048c2a..b2af7ef53 100644 --- a/plato/benchmarks/core.py +++ b/plato/benchmarks/core.py @@ -23,58 +23,59 @@ class Benchmark(base.Benchmark): """ CORE benchmark - evaluates language models on the CORE suite. """ - + def __init__(self): """ Initialize CORE benchmark -- load benchmark tasks and data. """ super().__init__() - + # These will be set externally before evaluate() is called self.model = None self.device = None self.tokenizer = None - + # Get configuration specific to CORE benchmark - self.random_seed = getattr(Config().benchmark, 'random_seed', 24) - self.max_per_task = getattr(Config().benchmark, 'max_per_task', -1) + self.random_seed = getattr(Config().benchmark, "random_seed", 24) + self.max_per_task = getattr(Config().benchmark, "max_per_task", -1) # Load benchmark tasks and datasets self._load_benchmark_data() - def _load_benchmark_data(self): """ Load CORE benchmark tasks and evaluation data. 
- + Downloads the evaluation bundle if not already present, then loads task configurations and data files. """ # Get base directory and ensure eval_bundle is downloaded benchmark_base_dir = Config.params["benchmark_path"] - + # Download eval_bundle if not present if not os.path.exists(benchmark_base_dir): logging.info("CORE evaluation bundle not found. Downloading...") - eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + eval_bundle_url = ( + "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + ) Benchmark.download(eval_bundle_url, benchmark_base_dir) - + # Load benchmark configuration eval_bundle_dir = os.path.join(benchmark_base_dir, "eval_bundle") config_path = os.path.join(eval_bundle_dir, "core.yaml") self.eval_meta_data_path = os.path.join(eval_bundle_dir, "eval_meta_data.csv") self.data_base_path = os.path.join(eval_bundle_dir, "eval_data") - + with open(config_path, "r") as f: config = yaml.safe_load(f) - + self.tasks = config["icl_tasks"] self.eval_metadata = pd.read_csv(self.eval_meta_data_path) - + def evaluate(self) -> dict[str, Any]: """ Evaluate the model on all CORE tasks. - + Returns: Dictionary containing: - 'results': per-task accuracies @@ -84,59 +85,59 @@ def evaluate(self) -> dict[str, Any]: if self.model is None: raise RuntimeError("Trainer has no model - cannot run benchmark") - + if self.tokenizer is None: raise RuntimeError("Trainer has no tokenizer - cannot run benchmark") results = {} centered_results = {} - + # Set model to eval mode self.model.eval() - + with torch.no_grad(): for task in self.tasks: start_time = time.time() label = task["label"] - + task_meta = { "task_type": task["icl_task_type"], "dataset_uri": task["dataset_uri"], "num_fewshot": task["num_fewshot"][0], "continuation_delimiter": task.get("continuation_delimiter", " "), } - + logging.info( "Evaluating task: %s (%d-shot, type: %s)", label, - task_meta['num_fewshot'], - task_meta['task_type'] + task_meta["num_fewshot"], + task_meta["task_type"], ) - + # Load data for this task (matching evaluate_model.py pattern) data_path = os.path.join(self.data_base_path, task_meta["dataset_uri"]) with open(data_path, "r") as f: data = [json.loads(line.strip()) for line in f] - + # Shuffle the data for reproducibility (matching evaluate_model.py) shuffle_rng = random.Random(self.random_seed) shuffle_rng.shuffle(data) - + # Crop data if max_per_task is specified if self.max_per_task > 0: - data = data[:self.max_per_task] - + data = data[: self.max_per_task] + # Run evaluation using existing core_eval logic accuracy = core.evaluate_task( - self.model, # Model in CUDA memory from trainer + self.model, # Model in CUDA memory from trainer self.tokenizer, # Tokenizer from trainer data, self.device, - task_meta + task_meta, ) - + results[label] = accuracy - + # Compute centered result (normalized by random baseline) row = self.eval_metadata[self.eval_metadata["Eval Task"] == label] random_baseline = row["Random baseline"].values[0] @@ -144,25 +145,24 @@ def evaluate(self) -> dict[str, Any]: 1.0 - 0.01 * random_baseline ) centered_results[label] = centered - + elapsed = time.time() - start_time logging.info( "accuracy: %.4f | centered: %.4f | time: %.2fs", accuracy, centered, - elapsed + elapsed, ) - + # Compute overall CORE metric core_metric = sum(centered_results.values()) / len(centered_results) - return { "results": results, "centered_results": centered_results, "core_metric": core_metric, } - + def get_formatted_result(self, evaluation_result: dict[str, 
Any]) -> str: """ Format the evaluation results for display. @@ -179,10 +179,9 @@ def get_formatted_result(self, evaluation_result: dict[str, Any]) -> str: result_lines = [f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}"] for task, acc in results.items(): centered = centered_results[task] - result_lines.append( - f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}" - ) - result_lines.append(f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n") + result_lines.append(f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}") + result_lines.append( + f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n" + ) return "\n".join(result_lines) - \ No newline at end of file diff --git a/plato/benchmarks/core_helpers/core.py b/plato/benchmarks/core_helpers/core.py index 2639c9c48..d767dd49d 100644 --- a/plato/benchmarks/core_helpers/core.py +++ b/plato/benchmarks/core_helpers/core.py @@ -154,13 +154,13 @@ def forward_model(model, input_ids): """ batch_size, seq_len = input_ids.size() outputs = model(input_ids) - + # Extract logits from model output (handles both raw tensors and HuggingFace output objects) - if hasattr(outputs, 'logits'): + if hasattr(outputs, "logits"): logits = outputs.logits else: logits = outputs - + # Roll the tensor to the left by one position to get the (autoregressive) target ids target_ids = torch.roll(input_ids, shifts=-1, dims=1) # Calculate cross entropy at all positions diff --git a/plato/benchmarks/core_helpers/tokenizer.py b/plato/benchmarks/core_helpers/tokenizer.py index 7b6f0e4d5..05dc619ab 100644 --- a/plato/benchmarks/core_helpers/tokenizer.py +++ b/plato/benchmarks/core_helpers/tokenizer.py @@ -167,7 +167,7 @@ def save(self, tokenizer_dir): class UniversalHuggingFaceTokenizer: """ Universal wrapper that provides a consistent interface for any HuggingFace tokenizer. - + This wrapper automatically detects special tokens (BOS, PAD, EOS) and provides utility methods that work across different tokenizer implementations. """ @@ -175,7 +175,7 @@ class UniversalHuggingFaceTokenizer: def __init__(self, tokenizer): """ Initialize the wrapper with a HuggingFace tokenizer. - + Args: tokenizer: A HuggingFace tokenizer instance (e.g., GPT2TokenizerFast) """ @@ -188,7 +188,7 @@ def __init__(self, tokenizer): def _detect_special_tokens(self): """ Auto-detect special token IDs from the tokenizer. - + Detection strategy (in order of priority): 1. Try direct attributes on the tokenizer (bos_token_id, pad_token_id, eos_token_id) 2. For missing tokens, use EOS as BOS/PAD for models like GPT-2 @@ -196,13 +196,22 @@ def _detect_special_tokens(self): 4. 
Final fallbacks: 0 for pad, pad for bos """ # Strategy 1: Direct attributes (works for most HuggingFace tokenizers) - if hasattr(self.tokenizer, 'bos_token_id') and self.tokenizer.bos_token_id is not None: + if ( + hasattr(self.tokenizer, "bos_token_id") + and self.tokenizer.bos_token_id is not None + ): self._bos_token_id = self.tokenizer.bos_token_id - - if hasattr(self.tokenizer, 'pad_token_id') and self.tokenizer.pad_token_id is not None: + + if ( + hasattr(self.tokenizer, "pad_token_id") + and self.tokenizer.pad_token_id is not None + ): self._pad_token_id = self.tokenizer.pad_token_id - - if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + + if ( + hasattr(self.tokenizer, "eos_token_id") + and self.tokenizer.eos_token_id is not None + ): self._eos_token_id = self.tokenizer.eos_token_id # For GPT-2 and similar models, BOS is often the same as EOS if self._bos_token_id is None: @@ -210,18 +219,22 @@ def _detect_special_tokens(self): # Use EOS as pad if no pad token exists if self._pad_token_id is None: self._pad_token_id = self._eos_token_id - + # Strategy 2: Try token_to_id method for tokenizers with nested structure if hasattr(self.tokenizer, "tokenizer"): tokenizer_obj = self.tokenizer.tokenizer if self._pad_token_id is None: pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] - self._pad_token_id = self._try_token_candidates(tokenizer_obj, pad_candidates) + self._pad_token_id = self._try_token_candidates( + tokenizer_obj, pad_candidates + ) if self._bos_token_id is None: bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] - self._bos_token_id = self._try_token_candidates(tokenizer_obj, bos_candidates) + self._bos_token_id = self._try_token_candidates( + tokenizer_obj, bos_candidates + ) # Strategy 3: Final fallbacks if self._pad_token_id is None: @@ -233,17 +246,17 @@ def _detect_special_tokens(self): def _try_token_candidates(self, tokenizer_obj, candidates): """ Try to find a token ID from a list of candidate token strings. - + Args: tokenizer_obj: The tokenizer object with token_to_id method candidates: List of token strings to try - + Returns: Token ID if found, None otherwise """ if not hasattr(tokenizer_obj, "token_to_id"): return None - + for candidate in candidates: token_id = tokenizer_obj.token_to_id(candidate) if token_id is not None: @@ -257,7 +270,7 @@ def get_bos_token_id(self): def get_pad_token_id(self): """Get the padding token ID.""" return self._pad_token_id - + def get_eos_token_id(self): """Get the end-of-sequence token ID.""" return self._eos_token_id @@ -265,11 +278,11 @@ def get_eos_token_id(self): def __call__(self, prompts, prepend=None): """ Tokenize prompts with optional prepended token. - + Args: prompts: Single string or list of strings to tokenize prepend: Optional token ID to prepend to each sequence - + Returns: List of token IDs, or list of lists if multiple prompts """ diff --git a/plato/benchmarks/registry.py b/plato/benchmarks/registry.py index 3cfb03252..1325e8bff 100644 --- a/plato/benchmarks/registry.py +++ b/plato/benchmarks/registry.py @@ -3,6 +3,7 @@ Enables runtime benchmark selection via configuration. 
""" + from plato.benchmarks import core from plato.benchmarks.base import Benchmark as BenchmarkBase @@ -10,6 +11,7 @@ "core": core.Benchmark, } + def get(type: str) -> BenchmarkBase: """Get an instance of the benchmark.""" if type in registered_benchmarks: @@ -18,10 +20,9 @@ def get(type: str) -> BenchmarkBase: else: available = list(registered_benchmarks.keys()) raise ValueError( - f"No such benchmark: {type}. " - f"Available benchmarks: {available}" + f"No such benchmark: {type}. Available benchmarks: {available}" ) - + return registered_benchmark diff --git a/plato/config.py b/plato/config.py index a3c73344e..9d0f68b15 100644 --- a/plato/config.py +++ b/plato/config.py @@ -346,7 +346,7 @@ def __new__(cls): # User specific benchmark if hasattr(config, "benchmark"): Config.benchmark = config.benchmark - + # Directory of benchmark dataset if hasattr(Config().benchmark, "data_path"): Config.params["benchmark_path"] = os.path.join( @@ -416,7 +416,7 @@ def __new__(cls): if hasattr(config, "parameters"): Config.parameters = config.parameters - + # Benchmark configuration (for model evaluation) if hasattr(config, "benchmark"): Config.benchmark = config.benchmark diff --git a/plato/servers/fedavg.py b/plato/servers/fedavg.py index b95fe76b8..30b262b5a 100644 --- a/plato/servers/fedavg.py +++ b/plato/servers/fedavg.py @@ -256,15 +256,23 @@ async def _process_reports(self): self.accuracy = trainer.test(self.testset, self.testset_sampler) # Evaluating the global model on the specified benchmark - if hasattr(Config().config, "benchmark") and hasattr(Config().benchmark, "type"): + if hasattr(Config().config, "benchmark") and hasattr( + Config().benchmark, "type" + ): benchmark_type = Config().benchmark.type if self.benchmark is None: self.benchmark = benchmarks_registry.get(benchmark_type) - logging.info("[%s] Started model evaluation on benchmark %s.", self, benchmark_type) + logging.info( + "[%s] Started model evaluation on benchmark %s.", self, benchmark_type + ) trainer = self.require_trainer() self.benchmark_result = trainer.eval(self.benchmark, self.testset_sampler) - logging.info("[%s] Model evaluation result on benchmark %s:\n%s.", self, benchmark_type, self.benchmark.get_formatted_result(self.benchmark_result)) - + logging.info( + "[%s] Model evaluation result on benchmark %s:\n%s.", + self, + benchmark_type, + self.benchmark.get_formatted_result(self.benchmark_result), + ) if hasattr(Config().trainer, "target_perplexity"): logging.info( diff --git a/plato/trainers/split_learning.py b/plato/trainers/split_learning.py index ac09322ee..dcacb08da 100644 --- a/plato/trainers/split_learning.py +++ b/plato/trainers/split_learning.py @@ -215,7 +215,10 @@ def test_model(self, model, config, testset, sampler, context): return accuracy def eval_model(self, model, config, benchmark, sampler, context): - raise NotImplementedError("eval_model is not implemented yet for SplitLearningTestingStrategy.") + raise NotImplementedError( + "eval_model is not implemented yet for SplitLearningTestingStrategy." + ) + # pylint:disable=too-many-instance-attributes class Trainer(ComposableTrainer): diff --git a/plato/trainers/strategies/base.py b/plato/trainers/strategies/base.py index a119ab20b..30032f252 100644 --- a/plato/trainers/strategies/base.py +++ b/plato/trainers/strategies/base.py @@ -586,4 +586,4 @@ def eval_model( setting eval mode, and computing the benchmark metrics. The specific return format depends on the benchmark type. 
""" - pass \ No newline at end of file + pass diff --git a/plato/trainers/strategies/testing.py b/plato/trainers/strategies/testing.py index dd76b9ff4..b170f35a0 100644 --- a/plato/trainers/strategies/testing.py +++ b/plato/trainers/strategies/testing.py @@ -99,14 +99,7 @@ def test_model(self, model, config, testset, sampler, context): return accuracy - def eval_model( - self, - model, - config, - benchmark, - sampler, - context - ) -> dict[str, Any]: + def eval_model(self, model, config, benchmark, sampler, context) -> dict[str, Any]: """ Evaluate the model on benchmark and return results. @@ -128,4 +121,4 @@ def eval_model( raise NotImplementedError( "DefaultTestingStrategy does not support benchmark evaluation. " "Please implement a custom TestingStrategy with eval_model() for your use case." - ) \ No newline at end of file + )