From 9cc99be72e5fcc0426ecf7596042f11c6c6f45fe Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 20:33:33 -0400 Subject: [PATCH 01/17] Added NanoChat evaluation modules under benchmarks/language_models. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrated NanoChat’s benchmark components into Plato to enable direct evaluation of Plato models using the NanoChat benchmark. Added files: - common.py: shared utilities and configurations for the benchmark - core_eval.py: implements the CORE benchmark evaluation logic - evaluate_model.py: main entry point to run model evaluation from Plato - report.py: handles result aggregation and reporting - tokenizer.py: provides tokenization utilities for language model evaluation --- benchmarks/language_models/common.py | 154 +++++++ benchmarks/language_models/core_eval.py | 269 +++++++++++ benchmarks/language_models/evaluate_model.py | 224 ++++++++++ benchmarks/language_models/report.py | 446 +++++++++++++++++++ benchmarks/language_models/tokenizer.py | 234 ++++++++++ 5 files changed, 1327 insertions(+) create mode 100644 benchmarks/language_models/common.py create mode 100644 benchmarks/language_models/core_eval.py create mode 100644 benchmarks/language_models/evaluate_model.py create mode 100644 benchmarks/language_models/report.py create mode 100644 benchmarks/language_models/tokenizer.py diff --git a/benchmarks/language_models/common.py b/benchmarks/language_models/common.py new file mode 100644 index 000000000..c13dfd424 --- /dev/null +++ b/benchmarks/language_models/common.py @@ -0,0 +1,154 @@ +""" +Common utilities for nanochat. +""" + +import os +import re +import logging +import torch +import torch.distributed as dist + + +class ColoredFormatter(logging.Formatter): + """Custom formatter that adds colors to log messages.""" + + # ANSI color codes + COLORS = { + "DEBUG": "\033[36m", # Cyan + "INFO": "\033[32m", # Green + "WARNING": "\033[33m", # Yellow + "ERROR": "\033[31m", # Red + "CRITICAL": "\033[35m", # Magenta + } + RESET = "\033[0m" + BOLD = "\033[1m" + + def format(self, record): + # Add color to the level name + levelname = record.levelname + if levelname in self.COLORS: + record.levelname = ( + f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}" + ) + # Format the message + message = super().format(record) + # Add color to specific parts of the message + if levelname == "INFO": + # Highlight numbers and percentages + message = re.sub( + r"(\d+\.?\d*\s*(?:GB|MB|%|docs))", + rf"{self.BOLD}\1{self.RESET}", + message, + ) + message = re.sub( + r"(Shard \d+)", + rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}", + message, + ) + return message + + +def setup_default_logging(): + handler = logging.StreamHandler() + handler.setFormatter( + ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + ) + logging.basicConfig(level=logging.INFO, handlers=[handler]) + + +setup_default_logging() +logger = logging.getLogger(__name__) + + +def get_base_dir(): + # co-locate nanochat intermediates with other cached data in ~/.cache (by default) + if os.environ.get("NANOCHAT_BASE_DIR"): + nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR") + else: + home_dir = os.path.expanduser("~") + cache_dir = os.path.join(home_dir, ".cache") + nanochat_dir = os.path.join(cache_dir, "nanochat") + os.makedirs(nanochat_dir, exist_ok=True) + return nanochat_dir + + +def print0(s="", **kwargs): + ddp_rank = int(os.environ.get("RANK", 0)) + if ddp_rank == 0: + print(s, **kwargs) + 
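+# A minimal usage sketch of the helpers defined below, mirroring how evaluate_model.py
+# drives them (works under a single process or torchrun; the evaluation itself is elided):
+#
+#   device_type = autodetect_device_type()
+#   ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
+#   ...  # run the evaluation on `device`
+#   compute_cleanup()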
+ +def is_ddp(): + # TODO is there a proper way + return int(os.environ.get("RANK", -1)) != -1 + + +def get_dist_info(): + if is_ddp(): + assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]) + ddp_rank = int(os.environ["RANK"]) + ddp_local_rank = int(os.environ["LOCAL_RANK"]) + ddp_world_size = int(os.environ["WORLD_SIZE"]) + return True, ddp_rank, ddp_local_rank, ddp_world_size + else: + return False, 0, 0, 1 + + +def autodetect_device_type(): + # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU + if torch.cuda.is_available(): + device_type = "cuda" + elif torch.backends.mps.is_available(): + device_type = "mps" + else: + device_type = "cpu" + print0(f"Autodetected device type: {device_type}") + return device_type + + +def compute_init(device_type="cuda"): # cuda|cpu|mps + """Basic initialization that we keep doing over and over, so make common.""" + + assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm" + if device_type == "cuda": + assert torch.cuda.is_available(), ( + "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'" + ) + if device_type == "mps": + assert torch.backends.mps.is_available(), ( + "Your PyTorch installation is not configured for MPS but device_type is 'mps'" + ) + + # Reproducibility + torch.manual_seed(42) + if device_type == "cuda": + torch.cuda.manual_seed(42) + # skipping full reproducibility for now, possibly investigate slowdown later + # torch.use_deterministic_algorithms(True) + + # Precision + if device_type == "cuda": + torch.set_float32_matmul_precision( + "high" + ) # uses tf32 instead of fp32 for matmuls + + # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA + ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + if ddp and device_type == "cuda": + device = torch.device("cuda", ddp_local_rank) + torch.cuda.set_device(device) # make "cuda" default to this device + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + else: + device = torch.device(device_type) # mps|cpu + + if ddp_rank == 0: + logger.info(f"Distributed world size: {ddp_world_size}") + + return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device + + +def compute_cleanup(): + """Companion function to compute_init, to clean things up before script exit""" + if is_ddp(): + dist.destroy_process_group() diff --git a/benchmarks/language_models/core_eval.py b/benchmarks/language_models/core_eval.py new file mode 100644 index 000000000..547d22cd3 --- /dev/null +++ b/benchmarks/language_models/core_eval.py @@ -0,0 +1,269 @@ +""" +Functions for evaluating the CORE metric, as described in the DCLM paper. +https://arxiv.org/abs/2406.11794 + +TODOs: +- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
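+
+For reference, how the score is assembled (evaluate_model.py is the driver):
+- evaluate_task() below returns the raw accuracy for one task,
+- each accuracy is centered against that task's random-guessing baseline,
+      centered = (accuracy - baseline) / (1 - baseline),
+- and the CORE metric is the mean of the centered accuracies across all tasks.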
+""" + +import random + +from jinja2 import Template +import torch +import torch.distributed as dist + +# ----------------------------------------------------------------------------- +# Prompt rendering utilities + + +def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a multiple choice question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} + +{% endfor -%} +{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [template.render(choice=choice, **context) for choice in item["choices"]] + return prompts + + +def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a schema question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [ + template.render(context=context_option, **context) + for context_option in item["context_options"] + ] + return prompts + + +def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): + """ + Render complete prompt for a language modeling task. + Notice that we manually trim the context in the template, + which in some datasets seems to have trailing whitespace (which we don't want). + """ + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + # Return two prompts: without and with the continuation + prompt_without = template.render(include_continuation=False, **context) + prompt_with = template.render(include_continuation=True, **context) + # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. + # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next + # token in prompt_with), meaning we don't get a nice and clean prefix in the token space + # to detect the final continuation. Tokenizers... 
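+    # Illustrative example (made-up strings, not from any dataset): with a trailing space,
+    # "Q: capital of France? A: " keeps the space as its own final token, whereas
+    # "Q: capital of France? A: Paris" folds it into " Paris", so the shorter prompt
+    # would no longer be an exact token-level prefix of the longer one.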
+ prompt_without = prompt_without.strip() + return [prompt_without, prompt_with] + + +def find_common_length(token_sequences, direction="left"): + """ + Find the length of the common prefix or suffix across token sequences + - direction: 'left' for prefix, 'right' for suffix + """ + min_len = min(len(seq) for seq in token_sequences) + indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] + # Find the first position where the token sequences differ + for i, idx in enumerate(indices): + token = token_sequences[0][idx] + if not all(seq[idx] == token for seq in token_sequences): + return i + return min_len + + +def stack_sequences(tokens, pad_token_id): + """Stack up a list of token sequences, pad to longest on the right""" + bsz, seq_len = len(tokens), max(len(x) for x in tokens) + input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) + for i, x in enumerate(tokens): + input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) + return input_ids + + +def batch_sequences_mc(tokenizer, prompts): + # In multiple choice, contexts are the same but the continuation is different (common prefix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each continuation + answer_start_idx = find_common_length(tokens, direction="left") + start_indices = [answer_start_idx] * len(prompts) + end_indices = [len(x) for x in tokens] + return tokens, start_indices, end_indices + + +def batch_sequences_schema(tokenizer, prompts): + # In schema tasks, contexts vary but continuation is the same (common suffix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each context + suffix_length = find_common_length(tokens, direction="right") + end_indices = [len(x) for x in tokens] + start_indices = [ei - suffix_length for ei in end_indices] + return tokens, start_indices, end_indices + + +def batch_sequences_lm(tokenizer, prompts): + # In LM tasks, we have two prompts: without and with continuation + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + tokens_without, tokens_with = tokens + start_idx, end_idx = len(tokens_without), len(tokens_with) + assert start_idx < end_idx, ( + "prompt without is supposed to be a prefix of prompt with" + ) + assert tokens_without == tokens_with[:start_idx], ( + "prompt without is supposed to be a prefix of prompt with" + ) + # we only need the with continuation prompt in the LM task, i.e. batch size of 1 + return [tokens_with], [start_idx], [end_idx] + + +@torch.no_grad() +def forward_model(model, input_ids): + """ + Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. + The last column of losses is set to nan because we don't have autoregressive targets there. 
+ """ + batch_size, seq_len = input_ids.size() + outputs = model(input_ids) + # Roll the tensor to the left by one position to get the (autoregressive) target ids + target_ids = torch.roll(input_ids, shifts=-1, dims=1) + # Calculate cross entropy at all positions + losses = torch.nn.functional.cross_entropy( + outputs.view(batch_size * seq_len, -1), + target_ids.view(batch_size * seq_len), + reduction="none", + ).view(batch_size, seq_len) + # Set the last column to be nan because there is no autoregressive loss there + losses[:, -1] = float("nan") + # Get the argmax predictions at each position + predictions = outputs.argmax(dim=-1) + return losses, predictions + + +@torch.no_grad() +def evaluate_example(idx, model, tokenizer, data, device, task_meta): + """Evaluate a single example, return True if correct, False otherwise""" + item = data[idx] + task_type = task_meta["task_type"] + num_fewshot = task_meta["num_fewshot"] + continuation_delimiter = task_meta["continuation_delimiter"] + + # Sample few-shot examples (excluding current item) + fewshot_examples = [] + if num_fewshot > 0: + rng = random.Random(1234 + idx) + available_indices = [i for i in range(len(data)) if i != idx] + fewshot_indices = rng.sample(available_indices, num_fewshot) + fewshot_examples = [data[i] for i in fewshot_indices] + + # Render prompts and batch sequences based on task type + if task_type == "multiple_choice": + prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) + elif task_type == "schema": + prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) + elif task_type == "language_modeling": + prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) + else: + raise ValueError(f"Unsupported task type: {task_type}") + + # Some models can't forward sequences beyond a certain length (e.g. GPT-2) + # In these cases, we have to truncate sequences to max length and adjust the indices + if hasattr(model, "max_seq_len") and model.max_seq_len is not None: + max_tokens = model.max_seq_len + new_tokens, new_start_idxs, new_end_idxs = [], [], [] + for t, s, e in zip(tokens, start_idxs, end_idxs): + if len(t) > max_tokens: + num_to_crop = len(t) - max_tokens + new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens + new_start_idxs.append(s - num_to_crop) # shift the indices down + new_end_idxs.append(e - num_to_crop) + assert s - num_to_crop >= 0, "this should never happen right?" + assert e - num_to_crop >= 0, "this should never happen right?" 
+ else: + new_tokens.append(t) # keep unchanged + new_start_idxs.append(s) + new_end_idxs.append(e) + tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs + + # Stack up all the sequences into a batch + pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok + input_ids = stack_sequences(tokens, pad_token_id) + input_ids = input_ids.to(device) + + # Forward the model, get the autoregressive loss and argmax prediction at each token + losses, predictions = forward_model(model, input_ids) + + # See if the losses/predictions come out correctly + if task_type == "language_modeling": + # language modeling task is currently always batch size 1 + si = start_idxs[0] + ei = end_idxs[0] + # predictions[i] predict input_ids[i+1] autoregressively + predicted_tokens = predictions[0, si - 1 : ei - 1] + actual_tokens = input_ids[0, si:ei] + is_correct = torch.all(predicted_tokens == actual_tokens).item() + elif task_type in ["multiple_choice", "schema"]: + # For MC/schema: find the option with lowest average loss + mean_losses = [ + losses[i, si - 1 : ei - 1].mean().item() + for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) + ] + pred_idx = mean_losses.index(min(mean_losses)) + is_correct = pred_idx == item["gold"] + else: + raise ValueError(f"Unsupported task type: {task_type}") + + return is_correct + + +def evaluate_task(model, tokenizer, data, device, task_meta): + """ + This function is responsible for evaluating one task across many examples. + It also handles dispatch to all processes if the script is run with torchrun. + """ + rank = dist.get_rank() if dist.is_initialized() else 0 + world_size = dist.get_world_size() if dist.is_initialized() else 1 + correct = torch.zeros(len(data), dtype=torch.float32, device=device) + # stride the examples to each rank + for idx in range(rank, len(data), world_size): + is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) + correct[idx] = float(is_correct) + # sync results across all the processes if running distributed + if world_size > 1: + dist.barrier() + dist.all_reduce(correct, op=dist.ReduceOp.SUM) + # compute the mean + mean_correct = correct.mean().item() + return mean_correct diff --git a/benchmarks/language_models/evaluate_model.py b/benchmarks/language_models/evaluate_model.py new file mode 100644 index 000000000..b133013c5 --- /dev/null +++ b/benchmarks/language_models/evaluate_model.py @@ -0,0 +1,224 @@ +""" +Evlauate the CORE metric for a given model. + +Run on a single GPU: +python base_eval.py --hf-path + +The script will print the CORE metric to the console. +""" + +import os +import time +import json +import random +import yaml +from contextlib import nullcontext + +import pandas as pd +import torch + +from common import ( + compute_init, + compute_cleanup, + print0, + get_base_dir, + autodetect_device_type, +) +from tokenizer import UniversalHuggingFaceTokenizer +from core_eval import evaluate_task + +# ----------------------------------------------------------------------------- +# nanoChat specific function dealing with I/O etc. + + +def evaluate_model(model, tokenizer, device, max_per_task=-1): + """ + Evaluate a base model on the CORE benchmark. + - max_per_task: crop the data to this many examples per task for testing (-1 = disable) + TODO: clean up this function, delete the need for all the files, for pandas dependency, etc. 
+ """ + # Load config and task metadata + base_dir = get_base_dir() + eval_bundle_dir = os.path.join(base_dir, "eval_bundle") + config_path = os.path.join(eval_bundle_dir, "core.yaml") + data_base_path = os.path.join(eval_bundle_dir, "eval_data") + eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + with open(config_path, "r") as f: + config = yaml.safe_load(f) + tasks = config["icl_tasks"] + eval_metadata = pd.read_csv(eval_meta_data) + + # Evaluate each task + results = {} + centered_results = {} + for task in tasks: + start_time = time.time() + label = task["label"] + task_meta = { + "task_type": task["icl_task_type"], + "dataset_uri": task["dataset_uri"], + "num_fewshot": task["num_fewshot"][0], + "continuation_delimiter": task.get("continuation_delimiter", " "), + } + print0( + f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", + end="", + ) + + # Load data for this task + data_path = os.path.join(data_base_path, task_meta["dataset_uri"]) + with open(data_path, "r") as f: + data = [json.loads(line.strip()) for line in f] + + # shuffle the data because in many cases it appears ordered but we want + # the abillity to only run a subset of the data for debugging purposes etc. + shuffle_rng = random.Random(1337) + shuffle_rng.shuffle(data) + if max_per_task > 0: + data = data[:max_per_task] + + # run the evaluation for this task + accuracy = evaluate_task(model, tokenizer, data, device, task_meta) + + results[label] = accuracy + row = eval_metadata[eval_metadata["Eval Task"] == label] + random_baseline = row["Random baseline"].values[0] + centered_result = (accuracy - 0.01 * random_baseline) / ( + 1.0 - 0.01 * random_baseline + ) + centered_results[label] = centered_result + end_time = time.time() + print0( + f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s" + ) + + core_metric = sum(centered_results.values()) / len(centered_results) + out = { + "results": results, + "centered_results": centered_results, + "core_metric": core_metric, + } + return out + + +# ----------------------------------------------------------------------------- +# HuggingFace loading utilities and light wrappers for a model + + +class ModelWrapper: + """Lightweight wrapper for a HuggingFace model""" + + def __init__(self, model, max_seq_len=None): + self.model = model + self.max_seq_len = max_seq_len + + def __call__(self, input_ids): + outputs = self.model(input_ids) + logits = outputs.logits + return logits + + +def load_hf_model(hf_path: str, device): + print0(f"Loading model from: {hf_path}") + from transformers import AutoModelForCausalLM, AutoConfig + + if os.path.exists(hf_path): + hf_path = os.path.abspath(hf_path) + print0(f"Using absolute path: {hf_path}") + + # Load config to help with token detection + config = AutoConfig.from_pretrained(hf_path, local_files_only=True) + model = AutoModelForCausalLM.from_pretrained(hf_path, local_files_only=True) + tokenizer = UniversalHuggingFaceTokenizer(hf_path, config) + + model.to(device) + model.eval() + max_seq_len = 1024 # subject to change based on model type, for GPT-2 it's 1024 + model = ModelWrapper(model, max_seq_len=max_seq_len) + return model, tokenizer + + +# ----------------------------------------------------------------------------- +def main(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--hf-path", + type=str, + default=None, + required=True, + help="HuggingFace model path to evaluate", + ) + 
parser.add_argument( + "--max-per-task", + type=int, + default=-1, + help="Max examples per task to evaluate (-1 = disable)", + ) + args = parser.parse_args() + + # distributed / precision setup + device_type = autodetect_device_type() + ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) + autocast_ctx = ( + torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) + if device_type == "cuda" + else nullcontext() + ) + + # Load model and tokenizer from command line or from file system + # atm assume that if a path is given, it's a huggingface model path + hf_path = args.hf_path + print0(f"Loading huggingface model from: {hf_path}") + model, tokenizer = load_hf_model(hf_path, device) + model_name = hf_path # just for logging + model_slug = hf_path.replace("/", "-") # for the output csv file + + # Evaluate the model + with autocast_ctx: + out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) + + # Write out the results to a csv file + core_metric = None + centered_results = {} + if ddp_rank == 0: + base_dir = get_base_dir() + output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") + os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) + results = out["results"] + centered_results = out["centered_results"] + core_metric = out["core_metric"] + with open(output_csv_path, "w") as f: + f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") + for label in results: + f.write( + f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n" + ) + f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") + # Print the content of the csv file to console too + print0("=" * 80) + print0(f"Model: {model_name}") + print0("=" * 80) + with open(output_csv_path, "r") as f: + print0(f.read()) + + # Log to report + from report import get_report + + get_report().log( + section="Base model evaluation", + data=[ + { + "Model": model_name, + "CORE metric": core_metric, + }, + centered_results, # the full table + ], + ) + + compute_cleanup() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/language_models/report.py b/benchmarks/language_models/report.py new file mode 100644 index 000000000..5721b1be2 --- /dev/null +++ b/benchmarks/language_models/report.py @@ -0,0 +1,446 @@ +""" +Utilities for generating training report cards. More messy code than usual, will fix. 
+""" + +import os +import re +import shutil +import subprocess +import socket +import datetime +import platform +import psutil +import torch + + +def run_command(cmd): + """Run a shell command and return output, or None if it fails.""" + try: + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + return result.stdout.strip() + return None + except: + return None + + +def get_git_info(): + """Get current git commit, branch, and dirty status.""" + info = {} + info["commit"] = run_command("git rev-parse --short HEAD") or "unknown" + info["branch"] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown" + + # Check if repo is dirty (has uncommitted changes) + status = run_command("git status --porcelain") + info["dirty"] = bool(status) if status is not None else False + + # Get commit message + info["message"] = run_command("git log -1 --pretty=%B") or "" + info["message"] = info["message"].split("\n")[0][:80] # First line, truncated + + return info + + +def get_gpu_info(): + """Get GPU information.""" + if not torch.cuda.is_available(): + return {"available": False} + + num_devices = torch.cuda.device_count() + info = {"available": True, "count": num_devices, "names": [], "memory_gb": []} + + for i in range(num_devices): + props = torch.cuda.get_device_properties(i) + info["names"].append(props.name) + info["memory_gb"].append(props.total_memory / (1024**3)) + + # Get CUDA version + info["cuda_version"] = torch.version.cuda or "unknown" + + return info + + +def get_system_info(): + """Get system information.""" + info = {} + + # Basic system info + info["hostname"] = socket.gethostname() + info["platform"] = platform.system() + info["python_version"] = platform.python_version() + info["torch_version"] = torch.__version__ + + # CPU and memory + info["cpu_count"] = psutil.cpu_count(logical=False) + info["cpu_count_logical"] = psutil.cpu_count(logical=True) + info["memory_gb"] = psutil.virtual_memory().total / (1024**3) + + # User and environment + info["user"] = os.environ.get("USER", "unknown") + info["nanochat_base_dir"] = os.environ.get("NANOCHAT_BASE_DIR", "out") + info["working_dir"] = os.getcwd() + + return info + + +def estimate_cost(gpu_info, runtime_hours=None): + """Estimate training cost based on GPU type and runtime.""" + + # Rough pricing, from Lambda Cloud + default_rate = 2.0 + gpu_hourly_rates = { + "H100": 3.00, + "A100": 1.79, + "V100": 0.55, + } + + if not gpu_info.get("available"): + return None + + # Try to identify GPU type from name + hourly_rate = None + gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown" + for gpu_type, rate in gpu_hourly_rates.items(): + if gpu_type in gpu_name: + hourly_rate = rate * gpu_info["count"] + break + + if hourly_rate is None: + hourly_rate = default_rate * gpu_info["count"] # Default estimate + + return { + "hourly_rate": hourly_rate, + "gpu_type": gpu_name, + "estimated_total": hourly_rate * runtime_hours if runtime_hours else None, + } + + +def generate_header(): + """Generate the header for a training report.""" + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + git_info = get_git_info() + gpu_info = get_gpu_info() + sys_info = get_system_info() + cost_info = estimate_cost(gpu_info) + + header = f"""# nanochat training report + +Generated: {timestamp} + +## Environment + +### Git Information +- Branch: {git_info["branch"]} +- Commit: {git_info["commit"]} {"(dirty)" if git_info["dirty"] else "(clean)"} +- Message: 
{git_info["message"]} + +### Hardware +- Platform: {sys_info["platform"]} +- CPUs: {sys_info["cpu_count"]} cores ({sys_info["cpu_count_logical"]} logical) +- Memory: {sys_info["memory_gb"]:.1f} GB +""" + + if gpu_info.get("available"): + gpu_names = ", ".join(set(gpu_info["names"])) + total_vram = sum(gpu_info["memory_gb"]) + header += f"""- GPUs: {gpu_info["count"]}x {gpu_names} +- GPU Memory: {total_vram:.1f} GB total +- CUDA Version: {gpu_info["cuda_version"]} +""" + else: + header += "- GPUs: None available\n" + + if cost_info and cost_info["hourly_rate"] > 0: + header += f"""- Hourly Rate: ${cost_info["hourly_rate"]:.2f}/hour\n""" + + header += f""" +### Software +- Python: {sys_info["python_version"]} +- PyTorch: {sys_info["torch_version"]} + +""" + + # bloat metrics: package all of the source code and assess its weight + packaged = run_command( + 'files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml' + ) + num_chars = len(packaged) + num_lines = len(packaged.split("\n")) + num_files = len([x for x in packaged.split("\n") if x.startswith("")]) + num_tokens = num_chars // 4 # assume approximately 4 chars per token + + # count dependencies via uv.lock + uv_lock_lines = 0 + if os.path.exists("uv.lock"): + with open("uv.lock", "r") as f: + uv_lock_lines = len(f.readlines()) + + header += f""" +### Bloat +- Characters: {num_chars:,} +- Lines: {num_lines:,} +- Files: {num_files:,} +- Tokens (approx): {num_tokens:,} +- Dependencies (uv.lock lines): {uv_lock_lines:,} + +""" + return header + + +# ----------------------------------------------------------------------------- + + +def slugify(text): + """Slugify a text string.""" + return text.lower().replace(" ", "-") + + +# the expected files and their order +EXPECTED_FILES = [ + "tokenizer-training.md", + "tokenizer-evaluation.md", + "base-model-training.md", + "base-model-loss.md", + "base-model-evaluation.md", + "midtraining.md", + "chat-evaluation-mid.md", + "chat-sft.md", + "chat-evaluation-sft.md", + "chat-rl.md", + "chat-evaluation-rl.md", +] +# the metrics we're currently interested in +chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"] + + +def extract(section, keys): + """simple def to extract a single key from a section""" + if not isinstance(keys, list): + keys = [keys] # convenience + out = {} + for line in section.split("\n"): + for key in keys: + if key in line: + out[key] = line.split(":")[1].strip() + return out + + +def extract_timestamp(content, prefix): + """Extract timestamp from content with given prefix.""" + for line in content.split("\n"): + if line.startswith(prefix): + time_str = line.split(":", 1)[1].strip() + try: + return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") + except: + pass + return None + + +class Report: + """Maintains a bunch of logs, generates a final markdown report.""" + + def __init__(self, report_dir): + os.makedirs(report_dir, exist_ok=True) + self.report_dir = report_dir + + def log(self, section, data): + """Log a section of data to the report.""" + slug = slugify(section) + file_name = f"{slug}.md" + file_path = os.path.join(self.report_dir, file_name) + with open(file_path, "w") as f: + f.write(f"## {section}\n") + f.write( + f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" + ) + for item in data: + if not item: + # skip falsy values like None or empty dict etc. 
+ continue + if isinstance(item, str): + # directly write the string + f.write(item) + else: + # render a dict + for k, v in item.items(): + if isinstance(v, float): + vstr = f"{v:.4f}" + elif isinstance(v, int) and v >= 10000: + vstr = f"{v:,.0f}" + else: + vstr = str(v) + f.write(f"- {k}: {vstr}\n") + f.write("\n") + return file_path + + def generate(self): + """Generate the final report.""" + report_dir = self.report_dir + report_file = os.path.join(report_dir, "report.md") + print(f"Generating report to {report_file}") + final_metrics = {} # the most important final metrics we'll add as table at the end + start_time = None + end_time = None + with open(report_file, "w") as out_file: + # write the header first + header_file = os.path.join(report_dir, "header.md") + if os.path.exists(header_file): + with open(header_file, "r") as f: + header_content = f.read() + out_file.write(header_content) + start_time = extract_timestamp(header_content, "Run started:") + # capture bloat data for summary later (the stuff after Bloat header and until \n\n) + bloat_data = re.search( + r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL + ) + bloat_data = bloat_data.group(1) if bloat_data else "" + else: + start_time = ( + None # will cause us to not write the total wall clock time + ) + bloat_data = "[bloat data missing]" + print( + f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?" + ) + # process all the individual sections + for file_name in EXPECTED_FILES: + section_file = os.path.join(report_dir, file_name) + if not os.path.exists(section_file): + print(f"Warning: {section_file} does not exist, skipping") + continue + with open(section_file, "r") as in_file: + section = in_file.read() + # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) + if "rl" not in file_name: + # Skip RL sections for end_time calculation because RL is experimental + end_time = extract_timestamp(section, "timestamp:") + # extract the most important metrics from the sections + if file_name == "base-model-evaluation.md": + final_metrics["base"] = extract(section, "CORE") + if file_name == "chat-evaluation-mid.md": + final_metrics["mid"] = extract(section, chat_metrics) + if file_name == "chat-evaluation-sft.md": + final_metrics["sft"] = extract(section, chat_metrics) + if file_name == "chat-evaluation-rl.md": + final_metrics["rl"] = extract( + section, "GSM8K" + ) # RL only evals GSM8K + # append this section of the report + out_file.write(section) + out_file.write("\n") + # add the final metrics table + out_file.write("## Summary\n\n") + # Copy over the bloat metrics from the header + out_file.write(bloat_data) + out_file.write("\n\n") + # Collect all unique metric names + all_metrics = set() + for stage_metrics in final_metrics.values(): + all_metrics.update(stage_metrics.keys()) + # Custom ordering: CORE first, ChatCORE last, rest in middle + all_metrics = sorted( + all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x) + ) + # Fixed column widths + stages = ["base", "mid", "sft", "rl"] + metric_width = 15 + value_width = 8 + # Write table header + header = f"| {'Metric'.ljust(metric_width)} |" + for stage in stages: + header += f" {stage.upper().ljust(value_width)} |" + out_file.write(header + "\n") + # Write separator + separator = f"|{'-' * (metric_width + 2)}|" + for stage in stages: + separator += f"{'-' * (value_width + 2)}|" + out_file.write(separator + "\n") + # Write table rows + for metric in all_metrics: + row = f"| 
{metric.ljust(metric_width)} |" + for stage in stages: + value = final_metrics.get(stage, {}).get(metric, "-") + row += f" {str(value).ljust(value_width)} |" + out_file.write(row + "\n") + out_file.write("\n") + # Calculate and write total wall clock time + if start_time and end_time: + duration = end_time - start_time + total_seconds = int(duration.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + out_file.write(f"Total wall clock time: {hours}h{minutes}m\n") + else: + out_file.write("Total wall clock time: unknown\n") + # also cp the report.md file to current directory + print(f"Copying report.md to current directory for convenience") + shutil.copy(report_file, "report.md") + return report_file + + def reset(self): + """Reset the report.""" + # Remove section files + for file_name in EXPECTED_FILES: + file_path = os.path.join(self.report_dir, file_name) + if os.path.exists(file_path): + os.remove(file_path) + # Remove report.md if it exists + report_file = os.path.join(self.report_dir, "report.md") + if os.path.exists(report_file): + os.remove(report_file) + # Generate and write the header section with start timestamp + header_file = os.path.join(self.report_dir, "header.md") + header = generate_header() + start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with open(header_file, "w") as f: + f.write(header) + f.write(f"Run started: {start_time}\n\n---\n\n") + print(f"Reset report and wrote header to {header_file}") + + +# ----------------------------------------------------------------------------- +# nanochat-specific convenience functions + + +class DummyReport: + def log(self, *args, **kwargs): + pass + + def reset(self, *args, **kwargs): + pass + + +def get_report(): + # just for convenience, only rank 0 logs to report + from common import get_base_dir, get_dist_info + + ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + if ddp_rank == 0: + report_dir = os.path.join(get_base_dir(), "report") + return Report(report_dir) + else: + return DummyReport() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Generate or reset nanochat training reports." + ) + parser.add_argument( + "command", + nargs="?", + default="generate", + choices=["generate", "reset"], + help="Operation to perform (default: generate)", + ) + args = parser.parse_args() + if args.command == "generate": + get_report().generate() + elif args.command == "reset": + get_report().reset() diff --git a/benchmarks/language_models/tokenizer.py b/benchmarks/language_models/tokenizer.py new file mode 100644 index 000000000..fc962c1d0 --- /dev/null +++ b/benchmarks/language_models/tokenizer.py @@ -0,0 +1,234 @@ +""" +BPE Tokenizer in the style of GPT-4. + +Two implementations are available: +1) HuggingFace Tokenizer that can do both training and inference but is really confusing +2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. 
+""" + +import os + +SPECIAL_TOKENS = [ + # every document begins with the Beginning of Sequence (BOS) token that delimits documents + "<|bos|>", + # tokens below are only used during finetuning to render Conversations into token ids + "<|user_start|>", # user messages + "<|user_end|>", + "<|assistant_start|>", # assistant messages + "<|assistant_end|>", + "<|python_start|>", # assistant invokes python REPL tool + "<|python_end|>", + "<|output_start|>", # python REPL outputs back to assistant + "<|output_end|>", +] + +# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} +# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. +# I haven't validated that this is actually a good idea, TODO. +SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" + +# ----------------------------------------------------------------------------- +# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer +from tokenizers import Tokenizer as HFTokenizer +from tokenizers import pre_tokenizers, decoders, Regex +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + + +class HuggingFaceTokenizer: + """Light wrapper around HuggingFace Tokenizer for some utilities""" + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, hf_path): + # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") + tokenizer = HFTokenizer.from_pretrained(hf_path) + return cls(tokenizer) + + @classmethod + def from_directory(cls, tokenizer_dir): + # init from a local directory on disk (e.g. "out/tokenizer") + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + tokenizer = HFTokenizer.from_file(tokenizer_path) + return cls(tokenizer) + + @classmethod + def train_from_iterator(cls, text_iterator, vocab_size): + # train from an iterator of text + # Configure the HuggingFace Tokenizer + tokenizer = HFTokenizer( + BPE( + byte_fallback=True, # needed! + unk_token=None, + fuse_unk=False, + ) + ) + # Normalizer: None + tokenizer.normalizer = None + # Pre-tokenizer: GPT-4 style + # the regex pattern used by GPT-4 to split text into groups before BPE + # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to + # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. + # (but I haven't validated this! TODO) + gpt4_split_regex = Regex( + SPLIT_PATTERN + ) # huggingface demands that you wrap it in Regex!! 
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + pattern=gpt4_split_regex, behavior="isolated", invert=False + ), + pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), + ] + ) + # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) + tokenizer.decoder = decoders.ByteLevel() + # Post-processor: None + tokenizer.post_processor = None + # Trainer: BPE + trainer = BpeTrainer( + vocab_size=vocab_size, + show_progress=True, + min_frequency=0, # no minimum frequency + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + special_tokens=SPECIAL_TOKENS, + ) + # Kick off the training + tokenizer.train_from_iterator(text_iterator, trainer) + return cls(tokenizer) + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + + def get_special_tokens(self): + special_tokens_map = self.tokenizer.get_added_tokens_decoder() + special_tokens = [w.content for w in special_tokens_map.values()] + return special_tokens + + def id_to_token(self, id): + return self.tokenizer.id_to_token(id) + + def _encode_one(self, text, prepend=None, append=None): + # encode a single string + # prepend/append can be either a string of a special token or a token id directly. + assert isinstance(text, str) + ids = [] + if prepend is not None: + prepend_id = ( + prepend if isinstance(prepend, int) else self.encode_special(prepend) + ) + ids.append(prepend_id) + ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) + if append is not None: + append_id = ( + append if isinstance(append, int) else self.encode_special(append) + ) + ids.append(append_id) + return ids + + def encode_special(self, text): + # encode a single special token via exact match + return self.tokenizer.token_to_id(text) + + def get_bos_token_id(self): + bos = self.encode_special("<|bos|>") + return bos + + def encode(self, text, *args, **kwargs): + if isinstance(text, str): + return self._encode_one(text, *args, **kwargs) + elif isinstance(text, list): + return [self._encode_one(t, *args, **kwargs) for t in text] + else: + raise ValueError(f"Invalid input type: {type(text)}") + + def __call__(self, *args, **kwargs): + return self.encode(*args, **kwargs) + + def decode(self, ids): + return self.tokenizer.decode(ids, skip_special_tokens=False) + + def save(self, tokenizer_dir): + # save the tokenizer to disk + os.makedirs(tokenizer_dir, exist_ok=True) + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + self.tokenizer.save(tokenizer_path) + print(f"Saved tokenizer to {tokenizer_path}") + + +# ----------------------------------------------------------------------------- +# Universal Tokenizer Wrapper that works with any HuggingFace model +class UniversalHuggingFaceTokenizer: + """Universal wrapper that works with any HuggingFace model""" + + def __init__(self, tokenizer_dir, model_config=None): + self.tokenizer = HuggingFaceTokenizer.from_directory(tokenizer_dir) + self.model_config = model_config + self._pad_token_id = None + self._bos_token_id = None + self._detect_special_tokens() + + def _detect_special_tokens(self): + """Auto-detect special tokens for any model""" + # Try to get pad token from tokenizer + if hasattr(self.tokenizer, "tokenizer"): + tokenizer_obj = self.tokenizer.tokenizer + + # Try common pad token names + pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] + for candidate in pad_candidates: + if hasattr(tokenizer_obj, "token_to_id"): + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + 
self._pad_token_id = token_id + break + + # Try common BOS token names + bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] + for candidate in bos_candidates: + if hasattr(tokenizer_obj, "token_to_id"): + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + self._bos_token_id = token_id + break + + # Fallback to config-based detection + if self.model_config and hasattr(self.model_config, "pad_token_id"): + self._pad_token_id = self.model_config.pad_token_id + + if self.model_config and hasattr(self.model_config, "bos_token_id"): + self._bos_token_id = self.model_config.bos_token_id + + # Final fallbacks based on common patterns + if self._pad_token_id is None: + # Most models use either 0 or their EOS token + self._pad_token_id = 0 + + if self._bos_token_id is None: + # Use pad token as fallback + self._bos_token_id = self._pad_token_id + + def get_bos_token_id(self): + return self._bos_token_id + + def get_pad_token_id(self): + return self._pad_token_id + + def __call__(self, prompts, prepend=None): + """Universal tokenization method""" + if isinstance(prompts, str): + prompts = [prompts] + + result = [] + for prompt in prompts: + tokens = self.tokenizer.encode(prompt) + if prepend is not None: + tokens = [prepend] + tokens + result.append(tokens) + + return result[0] if len(result) == 1 else result + + def __getattr__(self, name): + return getattr(self.tokenizer, name) From 06e666a5401ce98fd9c5ac769a5d7a63068a0b61 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 21:14:28 -0400 Subject: [PATCH 02/17] Added evaluation script for running NanoChat benchmark on HuggingFace models. Features: - Automatically sets up the NanoChat datasets under `.cache/nanochat`. - Downloads and unpacks the CORE evaluation bundle if not already available. - Invokes `evaluate_model.py` with the specified HuggingFace model path. - Adds argument parsing for `` and optional `[max_per_task]`. - Defaults `max_per_task` to 16 when not provided. Usage: bash evaluate_model.sh [optional: max_per_task] --- benchmarks/language_models/evaluate_model.sh | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 benchmarks/language_models/evaluate_model.sh diff --git a/benchmarks/language_models/evaluate_model.sh b/benchmarks/language_models/evaluate_model.sh new file mode 100644 index 000000000..e269bbecc --- /dev/null +++ b/benchmarks/language_models/evaluate_model.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Usage: bash evaluate_model.sh [optional: max_per_task] +# Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. +# model_path: Path to the HuggingFace model to evaluate. +# max_per_task: (Optional) Maximum number of examples to evaluate per task. + +export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" +mkdir -p $NANOCHAT_BASE_DIR + +EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip +if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then + curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL + unzip -q eval_bundle.zip + rm eval_bundle.zip + mv eval_bundle $NANOCHAT_BASE_DIR +fi + +if [ -z "$2" ]; then + MAX_PER_TASK=16 +else + MAX_PER_TASK=$2 +fi +uv run evaluate_model.py --hf_path=$1 --max-per-task=$MAX_PER_TASK + From 473d350cad3a20317b5fc71f4c98caf6e8eab58c Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Sun, 26 Oct 2025 21:16:12 -0400 Subject: [PATCH 03/17] Added missing explanation of --max_per_task default value. 
--- benchmarks/language_models/evaluate_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/language_models/evaluate_model.sh b/benchmarks/language_models/evaluate_model.sh index e269bbecc..4c443e46b 100644 --- a/benchmarks/language_models/evaluate_model.sh +++ b/benchmarks/language_models/evaluate_model.sh @@ -2,7 +2,7 @@ # Usage: bash evaluate_model.sh [optional: max_per_task] # Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. # model_path: Path to the HuggingFace model to evaluate. -# max_per_task: (Optional) Maximum number of examples to evaluate per task. +# max_per_task: (Optional) Maximum number of examples to evaluate per task, default setting to 16. export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR From f70334894982abc799cb2b14b7065baf04702a75 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Mon, 27 Oct 2025 22:04:07 -0400 Subject: [PATCH 04/17] Moved NanoChat benchmark from benchmarks/language_models to plato/benchmarks/language_model. --- {benchmarks => plato/benchmarks}/language_models/common.py | 0 {benchmarks => plato/benchmarks}/language_models/core_eval.py | 0 .../benchmarks}/language_models/evaluate_model.py | 0 .../benchmarks}/language_models/evaluate_model.sh | 0 {benchmarks => plato/benchmarks}/language_models/report.py | 0 {benchmarks => plato/benchmarks}/language_models/tokenizer.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename {benchmarks => plato/benchmarks}/language_models/common.py (100%) rename {benchmarks => plato/benchmarks}/language_models/core_eval.py (100%) rename {benchmarks => plato/benchmarks}/language_models/evaluate_model.py (100%) rename {benchmarks => plato/benchmarks}/language_models/evaluate_model.sh (100%) rename {benchmarks => plato/benchmarks}/language_models/report.py (100%) rename {benchmarks => plato/benchmarks}/language_models/tokenizer.py (100%) diff --git a/benchmarks/language_models/common.py b/plato/benchmarks/language_models/common.py similarity index 100% rename from benchmarks/language_models/common.py rename to plato/benchmarks/language_models/common.py diff --git a/benchmarks/language_models/core_eval.py b/plato/benchmarks/language_models/core_eval.py similarity index 100% rename from benchmarks/language_models/core_eval.py rename to plato/benchmarks/language_models/core_eval.py diff --git a/benchmarks/language_models/evaluate_model.py b/plato/benchmarks/language_models/evaluate_model.py similarity index 100% rename from benchmarks/language_models/evaluate_model.py rename to plato/benchmarks/language_models/evaluate_model.py diff --git a/benchmarks/language_models/evaluate_model.sh b/plato/benchmarks/language_models/evaluate_model.sh similarity index 100% rename from benchmarks/language_models/evaluate_model.sh rename to plato/benchmarks/language_models/evaluate_model.sh diff --git a/benchmarks/language_models/report.py b/plato/benchmarks/language_models/report.py similarity index 100% rename from benchmarks/language_models/report.py rename to plato/benchmarks/language_models/report.py diff --git a/benchmarks/language_models/tokenizer.py b/plato/benchmarks/language_models/tokenizer.py similarity index 100% rename from benchmarks/language_models/tokenizer.py rename to plato/benchmarks/language_models/tokenizer.py From 08dc54eda1963d7345bef72e9ec091cf1b9729b1 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 02:54:48 -0400 Subject: [PATCH 05/17] Cleaned up unused code 
from nanochat. --- plato/benchmarks/language_models/common.py | 154 ------ plato/benchmarks/language_models/core_eval.py | 269 ----------- .../language_models/evaluate_model.py | 224 --------- .../language_models/evaluate_model.sh | 24 - plato/benchmarks/language_models/report.py | 446 ------------------ plato/benchmarks/language_models/tokenizer.py | 234 --------- 6 files changed, 1351 deletions(-) delete mode 100644 plato/benchmarks/language_models/common.py delete mode 100644 plato/benchmarks/language_models/core_eval.py delete mode 100644 plato/benchmarks/language_models/evaluate_model.py delete mode 100644 plato/benchmarks/language_models/evaluate_model.sh delete mode 100644 plato/benchmarks/language_models/report.py delete mode 100644 plato/benchmarks/language_models/tokenizer.py diff --git a/plato/benchmarks/language_models/common.py b/plato/benchmarks/language_models/common.py deleted file mode 100644 index c13dfd424..000000000 --- a/plato/benchmarks/language_models/common.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Common utilities for nanochat. -""" - -import os -import re -import logging -import torch -import torch.distributed as dist - - -class ColoredFormatter(logging.Formatter): - """Custom formatter that adds colors to log messages.""" - - # ANSI color codes - COLORS = { - "DEBUG": "\033[36m", # Cyan - "INFO": "\033[32m", # Green - "WARNING": "\033[33m", # Yellow - "ERROR": "\033[31m", # Red - "CRITICAL": "\033[35m", # Magenta - } - RESET = "\033[0m" - BOLD = "\033[1m" - - def format(self, record): - # Add color to the level name - levelname = record.levelname - if levelname in self.COLORS: - record.levelname = ( - f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}" - ) - # Format the message - message = super().format(record) - # Add color to specific parts of the message - if levelname == "INFO": - # Highlight numbers and percentages - message = re.sub( - r"(\d+\.?\d*\s*(?:GB|MB|%|docs))", - rf"{self.BOLD}\1{self.RESET}", - message, - ) - message = re.sub( - r"(Shard \d+)", - rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}", - message, - ) - return message - - -def setup_default_logging(): - handler = logging.StreamHandler() - handler.setFormatter( - ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - ) - logging.basicConfig(level=logging.INFO, handlers=[handler]) - - -setup_default_logging() -logger = logging.getLogger(__name__) - - -def get_base_dir(): - # co-locate nanochat intermediates with other cached data in ~/.cache (by default) - if os.environ.get("NANOCHAT_BASE_DIR"): - nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR") - else: - home_dir = os.path.expanduser("~") - cache_dir = os.path.join(home_dir, ".cache") - nanochat_dir = os.path.join(cache_dir, "nanochat") - os.makedirs(nanochat_dir, exist_ok=True) - return nanochat_dir - - -def print0(s="", **kwargs): - ddp_rank = int(os.environ.get("RANK", 0)) - if ddp_rank == 0: - print(s, **kwargs) - - -def is_ddp(): - # TODO is there a proper way - return int(os.environ.get("RANK", -1)) != -1 - - -def get_dist_info(): - if is_ddp(): - assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]) - ddp_rank = int(os.environ["RANK"]) - ddp_local_rank = int(os.environ["LOCAL_RANK"]) - ddp_world_size = int(os.environ["WORLD_SIZE"]) - return True, ddp_rank, ddp_local_rank, ddp_world_size - else: - return False, 0, 0, 1 - - -def autodetect_device_type(): - # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU - if torch.cuda.is_available(): - 
device_type = "cuda" - elif torch.backends.mps.is_available(): - device_type = "mps" - else: - device_type = "cpu" - print0(f"Autodetected device type: {device_type}") - return device_type - - -def compute_init(device_type="cuda"): # cuda|cpu|mps - """Basic initialization that we keep doing over and over, so make common.""" - - assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm" - if device_type == "cuda": - assert torch.cuda.is_available(), ( - "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'" - ) - if device_type == "mps": - assert torch.backends.mps.is_available(), ( - "Your PyTorch installation is not configured for MPS but device_type is 'mps'" - ) - - # Reproducibility - torch.manual_seed(42) - if device_type == "cuda": - torch.cuda.manual_seed(42) - # skipping full reproducibility for now, possibly investigate slowdown later - # torch.use_deterministic_algorithms(True) - - # Precision - if device_type == "cuda": - torch.set_float32_matmul_precision( - "high" - ) # uses tf32 instead of fp32 for matmuls - - # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA - ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - if ddp and device_type == "cuda": - device = torch.device("cuda", ddp_local_rank) - torch.cuda.set_device(device) # make "cuda" default to this device - dist.init_process_group(backend="nccl", device_id=device) - dist.barrier() - else: - device = torch.device(device_type) # mps|cpu - - if ddp_rank == 0: - logger.info(f"Distributed world size: {ddp_world_size}") - - return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device - - -def compute_cleanup(): - """Companion function to compute_init, to clean things up before script exit""" - if is_ddp(): - dist.destroy_process_group() diff --git a/plato/benchmarks/language_models/core_eval.py b/plato/benchmarks/language_models/core_eval.py deleted file mode 100644 index 547d22cd3..000000000 --- a/plato/benchmarks/language_models/core_eval.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Functions for evaluating the CORE metric, as described in the DCLM paper. -https://arxiv.org/abs/2406.11794 - -TODOs: -- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
-""" - -import random - -from jinja2 import Template -import torch -import torch.distributed as dist - -# ----------------------------------------------------------------------------- -# Prompt rendering utilities - - -def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): - """Render complete prompts for a multiple choice question""" - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} - -{% endfor -%} -{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - prompts = [template.render(choice=choice, **context) for choice in item["choices"]] - return prompts - - -def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): - """Render complete prompts for a schema question""" - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} - -{% endfor -%} -{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - prompts = [ - template.render(context=context_option, **context) - for context_option in item["context_options"] - ] - return prompts - - -def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): - """ - Render complete prompt for a language modeling task. - Notice that we manually trim the context in the template, - which in some datasets seems to have trailing whitespace (which we don't want). - """ - template_str = """ -{%- for example in fewshot_examples -%} -{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} - -{% endfor -%} -{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() - template = Template(template_str) - fewshot_examples = fewshot_examples or [] - context = { - "fewshot_examples": fewshot_examples, - "continuation_delimiter": continuation_delimiter, - "item": item, - } - # Return two prompts: without and with the continuation - prompt_without = template.render(include_continuation=False, **context) - prompt_with = template.render(include_continuation=True, **context) - # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. - # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next - # token in prompt_with), meaning we don't get a nice and clean prefix in the token space - # to detect the final continuation. Tokenizers... 
- prompt_without = prompt_without.strip() - return [prompt_without, prompt_with] - - -def find_common_length(token_sequences, direction="left"): - """ - Find the length of the common prefix or suffix across token sequences - - direction: 'left' for prefix, 'right' for suffix - """ - min_len = min(len(seq) for seq in token_sequences) - indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] - # Find the first position where the token sequences differ - for i, idx in enumerate(indices): - token = token_sequences[0][idx] - if not all(seq[idx] == token for seq in token_sequences): - return i - return min_len - - -def stack_sequences(tokens, pad_token_id): - """Stack up a list of token sequences, pad to longest on the right""" - bsz, seq_len = len(tokens), max(len(x) for x in tokens) - input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) - for i, x in enumerate(tokens): - input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) - return input_ids - - -def batch_sequences_mc(tokenizer, prompts): - # In multiple choice, contexts are the same but the continuation is different (common prefix) - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - # figure out the start and end of each continuation - answer_start_idx = find_common_length(tokens, direction="left") - start_indices = [answer_start_idx] * len(prompts) - end_indices = [len(x) for x in tokens] - return tokens, start_indices, end_indices - - -def batch_sequences_schema(tokenizer, prompts): - # In schema tasks, contexts vary but continuation is the same (common suffix) - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - # figure out the start and end of each context - suffix_length = find_common_length(tokens, direction="right") - end_indices = [len(x) for x in tokens] - start_indices = [ei - suffix_length for ei in end_indices] - return tokens, start_indices, end_indices - - -def batch_sequences_lm(tokenizer, prompts): - # In LM tasks, we have two prompts: without and with continuation - tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) - tokens_without, tokens_with = tokens - start_idx, end_idx = len(tokens_without), len(tokens_with) - assert start_idx < end_idx, ( - "prompt without is supposed to be a prefix of prompt with" - ) - assert tokens_without == tokens_with[:start_idx], ( - "prompt without is supposed to be a prefix of prompt with" - ) - # we only need the with continuation prompt in the LM task, i.e. batch size of 1 - return [tokens_with], [start_idx], [end_idx] - - -@torch.no_grad() -def forward_model(model, input_ids): - """ - Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. - The last column of losses is set to nan because we don't have autoregressive targets there. 
- """ - batch_size, seq_len = input_ids.size() - outputs = model(input_ids) - # Roll the tensor to the left by one position to get the (autoregressive) target ids - target_ids = torch.roll(input_ids, shifts=-1, dims=1) - # Calculate cross entropy at all positions - losses = torch.nn.functional.cross_entropy( - outputs.view(batch_size * seq_len, -1), - target_ids.view(batch_size * seq_len), - reduction="none", - ).view(batch_size, seq_len) - # Set the last column to be nan because there is no autoregressive loss there - losses[:, -1] = float("nan") - # Get the argmax predictions at each position - predictions = outputs.argmax(dim=-1) - return losses, predictions - - -@torch.no_grad() -def evaluate_example(idx, model, tokenizer, data, device, task_meta): - """Evaluate a single example, return True if correct, False otherwise""" - item = data[idx] - task_type = task_meta["task_type"] - num_fewshot = task_meta["num_fewshot"] - continuation_delimiter = task_meta["continuation_delimiter"] - - # Sample few-shot examples (excluding current item) - fewshot_examples = [] - if num_fewshot > 0: - rng = random.Random(1234 + idx) - available_indices = [i for i in range(len(data)) if i != idx] - fewshot_indices = rng.sample(available_indices, num_fewshot) - fewshot_examples = [data[i] for i in fewshot_indices] - - # Render prompts and batch sequences based on task type - if task_type == "multiple_choice": - prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) - elif task_type == "schema": - prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) - elif task_type == "language_modeling": - prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) - tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) - else: - raise ValueError(f"Unsupported task type: {task_type}") - - # Some models can't forward sequences beyond a certain length (e.g. GPT-2) - # In these cases, we have to truncate sequences to max length and adjust the indices - if hasattr(model, "max_seq_len") and model.max_seq_len is not None: - max_tokens = model.max_seq_len - new_tokens, new_start_idxs, new_end_idxs = [], [], [] - for t, s, e in zip(tokens, start_idxs, end_idxs): - if len(t) > max_tokens: - num_to_crop = len(t) - max_tokens - new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens - new_start_idxs.append(s - num_to_crop) # shift the indices down - new_end_idxs.append(e - num_to_crop) - assert s - num_to_crop >= 0, "this should never happen right?" - assert e - num_to_crop >= 0, "this should never happen right?" 
- else: - new_tokens.append(t) # keep unchanged - new_start_idxs.append(s) - new_end_idxs.append(e) - tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs - - # Stack up all the sequences into a batch - pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok - input_ids = stack_sequences(tokens, pad_token_id) - input_ids = input_ids.to(device) - - # Forward the model, get the autoregressive loss and argmax prediction at each token - losses, predictions = forward_model(model, input_ids) - - # See if the losses/predictions come out correctly - if task_type == "language_modeling": - # language modeling task is currently always batch size 1 - si = start_idxs[0] - ei = end_idxs[0] - # predictions[i] predict input_ids[i+1] autoregressively - predicted_tokens = predictions[0, si - 1 : ei - 1] - actual_tokens = input_ids[0, si:ei] - is_correct = torch.all(predicted_tokens == actual_tokens).item() - elif task_type in ["multiple_choice", "schema"]: - # For MC/schema: find the option with lowest average loss - mean_losses = [ - losses[i, si - 1 : ei - 1].mean().item() - for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) - ] - pred_idx = mean_losses.index(min(mean_losses)) - is_correct = pred_idx == item["gold"] - else: - raise ValueError(f"Unsupported task type: {task_type}") - - return is_correct - - -def evaluate_task(model, tokenizer, data, device, task_meta): - """ - This function is responsible for evaluating one task across many examples. - It also handles dispatch to all processes if the script is run with torchrun. - """ - rank = dist.get_rank() if dist.is_initialized() else 0 - world_size = dist.get_world_size() if dist.is_initialized() else 1 - correct = torch.zeros(len(data), dtype=torch.float32, device=device) - # stride the examples to each rank - for idx in range(rank, len(data), world_size): - is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) - correct[idx] = float(is_correct) - # sync results across all the processes if running distributed - if world_size > 1: - dist.barrier() - dist.all_reduce(correct, op=dist.ReduceOp.SUM) - # compute the mean - mean_correct = correct.mean().item() - return mean_correct diff --git a/plato/benchmarks/language_models/evaluate_model.py b/plato/benchmarks/language_models/evaluate_model.py deleted file mode 100644 index b133013c5..000000000 --- a/plato/benchmarks/language_models/evaluate_model.py +++ /dev/null @@ -1,224 +0,0 @@ -""" -Evlauate the CORE metric for a given model. - -Run on a single GPU: -python base_eval.py --hf-path - -The script will print the CORE metric to the console. -""" - -import os -import time -import json -import random -import yaml -from contextlib import nullcontext - -import pandas as pd -import torch - -from common import ( - compute_init, - compute_cleanup, - print0, - get_base_dir, - autodetect_device_type, -) -from tokenizer import UniversalHuggingFaceTokenizer -from core_eval import evaluate_task - -# ----------------------------------------------------------------------------- -# nanoChat specific function dealing with I/O etc. - - -def evaluate_model(model, tokenizer, device, max_per_task=-1): - """ - Evaluate a base model on the CORE benchmark. - - max_per_task: crop the data to this many examples per task for testing (-1 = disable) - TODO: clean up this function, delete the need for all the files, for pandas dependency, etc. 
- """ - # Load config and task metadata - base_dir = get_base_dir() - eval_bundle_dir = os.path.join(base_dir, "eval_bundle") - config_path = os.path.join(eval_bundle_dir, "core.yaml") - data_base_path = os.path.join(eval_bundle_dir, "eval_data") - eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") - with open(config_path, "r") as f: - config = yaml.safe_load(f) - tasks = config["icl_tasks"] - eval_metadata = pd.read_csv(eval_meta_data) - - # Evaluate each task - results = {} - centered_results = {} - for task in tasks: - start_time = time.time() - label = task["label"] - task_meta = { - "task_type": task["icl_task_type"], - "dataset_uri": task["dataset_uri"], - "num_fewshot": task["num_fewshot"][0], - "continuation_delimiter": task.get("continuation_delimiter", " "), - } - print0( - f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", - end="", - ) - - # Load data for this task - data_path = os.path.join(data_base_path, task_meta["dataset_uri"]) - with open(data_path, "r") as f: - data = [json.loads(line.strip()) for line in f] - - # shuffle the data because in many cases it appears ordered but we want - # the abillity to only run a subset of the data for debugging purposes etc. - shuffle_rng = random.Random(1337) - shuffle_rng.shuffle(data) - if max_per_task > 0: - data = data[:max_per_task] - - # run the evaluation for this task - accuracy = evaluate_task(model, tokenizer, data, device, task_meta) - - results[label] = accuracy - row = eval_metadata[eval_metadata["Eval Task"] == label] - random_baseline = row["Random baseline"].values[0] - centered_result = (accuracy - 0.01 * random_baseline) / ( - 1.0 - 0.01 * random_baseline - ) - centered_results[label] = centered_result - end_time = time.time() - print0( - f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s" - ) - - core_metric = sum(centered_results.values()) / len(centered_results) - out = { - "results": results, - "centered_results": centered_results, - "core_metric": core_metric, - } - return out - - -# ----------------------------------------------------------------------------- -# HuggingFace loading utilities and light wrappers for a model - - -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def __call__(self, input_ids): - outputs = self.model(input_ids) - logits = outputs.logits - return logits - - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - from transformers import AutoModelForCausalLM, AutoConfig - - if os.path.exists(hf_path): - hf_path = os.path.abspath(hf_path) - print0(f"Using absolute path: {hf_path}") - - # Load config to help with token detection - config = AutoConfig.from_pretrained(hf_path, local_files_only=True) - model = AutoModelForCausalLM.from_pretrained(hf_path, local_files_only=True) - tokenizer = UniversalHuggingFaceTokenizer(hf_path, config) - - model.to(device) - model.eval() - max_seq_len = 1024 # subject to change based on model type, for GPT-2 it's 1024 - model = ModelWrapper(model, max_seq_len=max_seq_len) - return model, tokenizer - - -# ----------------------------------------------------------------------------- -def main(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf-path", - type=str, - default=None, - required=True, - help="HuggingFace model path to evaluate", - ) - 
parser.add_argument( - "--max-per-task", - type=int, - default=-1, - help="Max examples per task to evaluate (-1 = disable)", - ) - args = parser.parse_args() - - # distributed / precision setup - device_type = autodetect_device_type() - ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) - autocast_ctx = ( - torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) - if device_type == "cuda" - else nullcontext() - ) - - # Load model and tokenizer from command line or from file system - # atm assume that if a path is given, it's a huggingface model path - hf_path = args.hf_path - print0(f"Loading huggingface model from: {hf_path}") - model, tokenizer = load_hf_model(hf_path, device) - model_name = hf_path # just for logging - model_slug = hf_path.replace("/", "-") # for the output csv file - - # Evaluate the model - with autocast_ctx: - out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) - - # Write out the results to a csv file - core_metric = None - centered_results = {} - if ddp_rank == 0: - base_dir = get_base_dir() - output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") - os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) - results = out["results"] - centered_results = out["centered_results"] - core_metric = out["core_metric"] - with open(output_csv_path, "w") as f: - f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") - for label in results: - f.write( - f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n" - ) - f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") - # Print the content of the csv file to console too - print0("=" * 80) - print0(f"Model: {model_name}") - print0("=" * 80) - with open(output_csv_path, "r") as f: - print0(f.read()) - - # Log to report - from report import get_report - - get_report().log( - section="Base model evaluation", - data=[ - { - "Model": model_name, - "CORE metric": core_metric, - }, - centered_results, # the full table - ], - ) - - compute_cleanup() - - -if __name__ == "__main__": - main() diff --git a/plato/benchmarks/language_models/evaluate_model.sh b/plato/benchmarks/language_models/evaluate_model.sh deleted file mode 100644 index 4c443e46b..000000000 --- a/plato/benchmarks/language_models/evaluate_model.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Usage: bash evaluate_model.sh [optional: max_per_task] -# Comment: This script evaluates a HuggingFace-based language model using the NanoChat benchmark infrastructure. -# model_path: Path to the HuggingFace model to evaluate. -# max_per_task: (Optional) Maximum number of examples to evaluate per task, default setting to 16. - -export NANOCHAT_BASE_DIR="$PWD/.cache/nanochat" -mkdir -p $NANOCHAT_BASE_DIR - -EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip -if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then - curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL - unzip -q eval_bundle.zip - rm eval_bundle.zip - mv eval_bundle $NANOCHAT_BASE_DIR -fi - -if [ -z "$2" ]; then - MAX_PER_TASK=16 -else - MAX_PER_TASK=$2 -fi -uv run evaluate_model.py --hf_path=$1 --max-per-task=$MAX_PER_TASK - diff --git a/plato/benchmarks/language_models/report.py b/plato/benchmarks/language_models/report.py deleted file mode 100644 index 5721b1be2..000000000 --- a/plato/benchmarks/language_models/report.py +++ /dev/null @@ -1,446 +0,0 @@ -""" -Utilities for generating training report cards. More messy code than usual, will fix. 
-""" - -import os -import re -import shutil -import subprocess -import socket -import datetime -import platform -import psutil -import torch - - -def run_command(cmd): - """Run a shell command and return output, or None if it fails.""" - try: - result = subprocess.run( - cmd, shell=True, capture_output=True, text=True, timeout=5 - ) - if result.returncode == 0: - return result.stdout.strip() - return None - except: - return None - - -def get_git_info(): - """Get current git commit, branch, and dirty status.""" - info = {} - info["commit"] = run_command("git rev-parse --short HEAD") or "unknown" - info["branch"] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown" - - # Check if repo is dirty (has uncommitted changes) - status = run_command("git status --porcelain") - info["dirty"] = bool(status) if status is not None else False - - # Get commit message - info["message"] = run_command("git log -1 --pretty=%B") or "" - info["message"] = info["message"].split("\n")[0][:80] # First line, truncated - - return info - - -def get_gpu_info(): - """Get GPU information.""" - if not torch.cuda.is_available(): - return {"available": False} - - num_devices = torch.cuda.device_count() - info = {"available": True, "count": num_devices, "names": [], "memory_gb": []} - - for i in range(num_devices): - props = torch.cuda.get_device_properties(i) - info["names"].append(props.name) - info["memory_gb"].append(props.total_memory / (1024**3)) - - # Get CUDA version - info["cuda_version"] = torch.version.cuda or "unknown" - - return info - - -def get_system_info(): - """Get system information.""" - info = {} - - # Basic system info - info["hostname"] = socket.gethostname() - info["platform"] = platform.system() - info["python_version"] = platform.python_version() - info["torch_version"] = torch.__version__ - - # CPU and memory - info["cpu_count"] = psutil.cpu_count(logical=False) - info["cpu_count_logical"] = psutil.cpu_count(logical=True) - info["memory_gb"] = psutil.virtual_memory().total / (1024**3) - - # User and environment - info["user"] = os.environ.get("USER", "unknown") - info["nanochat_base_dir"] = os.environ.get("NANOCHAT_BASE_DIR", "out") - info["working_dir"] = os.getcwd() - - return info - - -def estimate_cost(gpu_info, runtime_hours=None): - """Estimate training cost based on GPU type and runtime.""" - - # Rough pricing, from Lambda Cloud - default_rate = 2.0 - gpu_hourly_rates = { - "H100": 3.00, - "A100": 1.79, - "V100": 0.55, - } - - if not gpu_info.get("available"): - return None - - # Try to identify GPU type from name - hourly_rate = None - gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown" - for gpu_type, rate in gpu_hourly_rates.items(): - if gpu_type in gpu_name: - hourly_rate = rate * gpu_info["count"] - break - - if hourly_rate is None: - hourly_rate = default_rate * gpu_info["count"] # Default estimate - - return { - "hourly_rate": hourly_rate, - "gpu_type": gpu_name, - "estimated_total": hourly_rate * runtime_hours if runtime_hours else None, - } - - -def generate_header(): - """Generate the header for a training report.""" - timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - git_info = get_git_info() - gpu_info = get_gpu_info() - sys_info = get_system_info() - cost_info = estimate_cost(gpu_info) - - header = f"""# nanochat training report - -Generated: {timestamp} - -## Environment - -### Git Information -- Branch: {git_info["branch"]} -- Commit: {git_info["commit"]} {"(dirty)" if git_info["dirty"] else "(clean)"} -- Message: 
{git_info["message"]} - -### Hardware -- Platform: {sys_info["platform"]} -- CPUs: {sys_info["cpu_count"]} cores ({sys_info["cpu_count_logical"]} logical) -- Memory: {sys_info["memory_gb"]:.1f} GB -""" - - if gpu_info.get("available"): - gpu_names = ", ".join(set(gpu_info["names"])) - total_vram = sum(gpu_info["memory_gb"]) - header += f"""- GPUs: {gpu_info["count"]}x {gpu_names} -- GPU Memory: {total_vram:.1f} GB total -- CUDA Version: {gpu_info["cuda_version"]} -""" - else: - header += "- GPUs: None available\n" - - if cost_info and cost_info["hourly_rate"] > 0: - header += f"""- Hourly Rate: ${cost_info["hourly_rate"]:.2f}/hour\n""" - - header += f""" -### Software -- Python: {sys_info["python_version"]} -- PyTorch: {sys_info["torch_version"]} - -""" - - # bloat metrics: package all of the source code and assess its weight - packaged = run_command( - 'files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml' - ) - num_chars = len(packaged) - num_lines = len(packaged.split("\n")) - num_files = len([x for x in packaged.split("\n") if x.startswith("")]) - num_tokens = num_chars // 4 # assume approximately 4 chars per token - - # count dependencies via uv.lock - uv_lock_lines = 0 - if os.path.exists("uv.lock"): - with open("uv.lock", "r") as f: - uv_lock_lines = len(f.readlines()) - - header += f""" -### Bloat -- Characters: {num_chars:,} -- Lines: {num_lines:,} -- Files: {num_files:,} -- Tokens (approx): {num_tokens:,} -- Dependencies (uv.lock lines): {uv_lock_lines:,} - -""" - return header - - -# ----------------------------------------------------------------------------- - - -def slugify(text): - """Slugify a text string.""" - return text.lower().replace(" ", "-") - - -# the expected files and their order -EXPECTED_FILES = [ - "tokenizer-training.md", - "tokenizer-evaluation.md", - "base-model-training.md", - "base-model-loss.md", - "base-model-evaluation.md", - "midtraining.md", - "chat-evaluation-mid.md", - "chat-sft.md", - "chat-evaluation-sft.md", - "chat-rl.md", - "chat-evaluation-rl.md", -] -# the metrics we're currently interested in -chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"] - - -def extract(section, keys): - """simple def to extract a single key from a section""" - if not isinstance(keys, list): - keys = [keys] # convenience - out = {} - for line in section.split("\n"): - for key in keys: - if key in line: - out[key] = line.split(":")[1].strip() - return out - - -def extract_timestamp(content, prefix): - """Extract timestamp from content with given prefix.""" - for line in content.split("\n"): - if line.startswith(prefix): - time_str = line.split(":", 1)[1].strip() - try: - return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") - except: - pass - return None - - -class Report: - """Maintains a bunch of logs, generates a final markdown report.""" - - def __init__(self, report_dir): - os.makedirs(report_dir, exist_ok=True) - self.report_dir = report_dir - - def log(self, section, data): - """Log a section of data to the report.""" - slug = slugify(section) - file_name = f"{slug}.md" - file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w") as f: - f.write(f"## {section}\n") - f.write( - f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" - ) - for item in data: - if not item: - # skip falsy values like None or empty dict etc. 
- continue - if isinstance(item, str): - # directly write the string - f.write(item) - else: - # render a dict - for k, v in item.items(): - if isinstance(v, float): - vstr = f"{v:.4f}" - elif isinstance(v, int) and v >= 10000: - vstr = f"{v:,.0f}" - else: - vstr = str(v) - f.write(f"- {k}: {vstr}\n") - f.write("\n") - return file_path - - def generate(self): - """Generate the final report.""" - report_dir = self.report_dir - report_file = os.path.join(report_dir, "report.md") - print(f"Generating report to {report_file}") - final_metrics = {} # the most important final metrics we'll add as table at the end - start_time = None - end_time = None - with open(report_file, "w") as out_file: - # write the header first - header_file = os.path.join(report_dir, "header.md") - if os.path.exists(header_file): - with open(header_file, "r") as f: - header_content = f.read() - out_file.write(header_content) - start_time = extract_timestamp(header_content, "Run started:") - # capture bloat data for summary later (the stuff after Bloat header and until \n\n) - bloat_data = re.search( - r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL - ) - bloat_data = bloat_data.group(1) if bloat_data else "" - else: - start_time = ( - None # will cause us to not write the total wall clock time - ) - bloat_data = "[bloat data missing]" - print( - f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?" - ) - # process all the individual sections - for file_name in EXPECTED_FILES: - section_file = os.path.join(report_dir, file_name) - if not os.path.exists(section_file): - print(f"Warning: {section_file} does not exist, skipping") - continue - with open(section_file, "r") as in_file: - section = in_file.read() - # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) - if "rl" not in file_name: - # Skip RL sections for end_time calculation because RL is experimental - end_time = extract_timestamp(section, "timestamp:") - # extract the most important metrics from the sections - if file_name == "base-model-evaluation.md": - final_metrics["base"] = extract(section, "CORE") - if file_name == "chat-evaluation-mid.md": - final_metrics["mid"] = extract(section, chat_metrics) - if file_name == "chat-evaluation-sft.md": - final_metrics["sft"] = extract(section, chat_metrics) - if file_name == "chat-evaluation-rl.md": - final_metrics["rl"] = extract( - section, "GSM8K" - ) # RL only evals GSM8K - # append this section of the report - out_file.write(section) - out_file.write("\n") - # add the final metrics table - out_file.write("## Summary\n\n") - # Copy over the bloat metrics from the header - out_file.write(bloat_data) - out_file.write("\n\n") - # Collect all unique metric names - all_metrics = set() - for stage_metrics in final_metrics.values(): - all_metrics.update(stage_metrics.keys()) - # Custom ordering: CORE first, ChatCORE last, rest in middle - all_metrics = sorted( - all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x) - ) - # Fixed column widths - stages = ["base", "mid", "sft", "rl"] - metric_width = 15 - value_width = 8 - # Write table header - header = f"| {'Metric'.ljust(metric_width)} |" - for stage in stages: - header += f" {stage.upper().ljust(value_width)} |" - out_file.write(header + "\n") - # Write separator - separator = f"|{'-' * (metric_width + 2)}|" - for stage in stages: - separator += f"{'-' * (value_width + 2)}|" - out_file.write(separator + "\n") - # Write table rows - for metric in all_metrics: - row = f"| 
{metric.ljust(metric_width)} |" - for stage in stages: - value = final_metrics.get(stage, {}).get(metric, "-") - row += f" {str(value).ljust(value_width)} |" - out_file.write(row + "\n") - out_file.write("\n") - # Calculate and write total wall clock time - if start_time and end_time: - duration = end_time - start_time - total_seconds = int(duration.total_seconds()) - hours = total_seconds // 3600 - minutes = (total_seconds % 3600) // 60 - out_file.write(f"Total wall clock time: {hours}h{minutes}m\n") - else: - out_file.write("Total wall clock time: unknown\n") - # also cp the report.md file to current directory - print(f"Copying report.md to current directory for convenience") - shutil.copy(report_file, "report.md") - return report_file - - def reset(self): - """Reset the report.""" - # Remove section files - for file_name in EXPECTED_FILES: - file_path = os.path.join(self.report_dir, file_name) - if os.path.exists(file_path): - os.remove(file_path) - # Remove report.md if it exists - report_file = os.path.join(self.report_dir, "report.md") - if os.path.exists(report_file): - os.remove(report_file) - # Generate and write the header section with start timestamp - header_file = os.path.join(self.report_dir, "header.md") - header = generate_header() - start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w") as f: - f.write(header) - f.write(f"Run started: {start_time}\n\n---\n\n") - print(f"Reset report and wrote header to {header_file}") - - -# ----------------------------------------------------------------------------- -# nanochat-specific convenience functions - - -class DummyReport: - def log(self, *args, **kwargs): - pass - - def reset(self, *args, **kwargs): - pass - - -def get_report(): - # just for convenience, only rank 0 logs to report - from common import get_base_dir, get_dist_info - - ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - if ddp_rank == 0: - report_dir = os.path.join(get_base_dir(), "report") - return Report(report_dir) - else: - return DummyReport() - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - description="Generate or reset nanochat training reports." - ) - parser.add_argument( - "command", - nargs="?", - default="generate", - choices=["generate", "reset"], - help="Operation to perform (default: generate)", - ) - args = parser.parse_args() - if args.command == "generate": - get_report().generate() - elif args.command == "reset": - get_report().reset() diff --git a/plato/benchmarks/language_models/tokenizer.py b/plato/benchmarks/language_models/tokenizer.py deleted file mode 100644 index fc962c1d0..000000000 --- a/plato/benchmarks/language_models/tokenizer.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -BPE Tokenizer in the style of GPT-4. - -Two implementations are available: -1) HuggingFace Tokenizer that can do both training and inference but is really confusing -2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. 
-""" - -import os - -SPECIAL_TOKENS = [ - # every document begins with the Beginning of Sequence (BOS) token that delimits documents - "<|bos|>", - # tokens below are only used during finetuning to render Conversations into token ids - "<|user_start|>", # user messages - "<|user_end|>", - "<|assistant_start|>", # assistant messages - "<|assistant_end|>", - "<|python_start|>", # assistant invokes python REPL tool - "<|python_end|>", - "<|output_start|>", # python REPL outputs back to assistant - "<|output_end|>", -] - -# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} -# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. -# I haven't validated that this is actually a good idea, TODO. -SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" - -# ----------------------------------------------------------------------------- -# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer -from tokenizers import Tokenizer as HFTokenizer -from tokenizers import pre_tokenizers, decoders, Regex -from tokenizers.models import BPE -from tokenizers.trainers import BpeTrainer - - -class HuggingFaceTokenizer: - """Light wrapper around HuggingFace Tokenizer for some utilities""" - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @classmethod - def from_pretrained(cls, hf_path): - # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") - tokenizer = HFTokenizer.from_pretrained(hf_path) - return cls(tokenizer) - - @classmethod - def from_directory(cls, tokenizer_dir): - # init from a local directory on disk (e.g. "out/tokenizer") - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - tokenizer = HFTokenizer.from_file(tokenizer_path) - return cls(tokenizer) - - @classmethod - def train_from_iterator(cls, text_iterator, vocab_size): - # train from an iterator of text - # Configure the HuggingFace Tokenizer - tokenizer = HFTokenizer( - BPE( - byte_fallback=True, # needed! - unk_token=None, - fuse_unk=False, - ) - ) - # Normalizer: None - tokenizer.normalizer = None - # Pre-tokenizer: GPT-4 style - # the regex pattern used by GPT-4 to split text into groups before BPE - # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to - # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. - # (but I haven't validated this! TODO) - gpt4_split_regex = Regex( - SPLIT_PATTERN - ) # huggingface demands that you wrap it in Regex!! 
- tokenizer.pre_tokenizer = pre_tokenizers.Sequence( - [ - pre_tokenizers.Split( - pattern=gpt4_split_regex, behavior="isolated", invert=False - ), - pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), - ] - ) - # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) - tokenizer.decoder = decoders.ByteLevel() - # Post-processor: None - tokenizer.post_processor = None - # Trainer: BPE - trainer = BpeTrainer( - vocab_size=vocab_size, - show_progress=True, - min_frequency=0, # no minimum frequency - initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), - special_tokens=SPECIAL_TOKENS, - ) - # Kick off the training - tokenizer.train_from_iterator(text_iterator, trainer) - return cls(tokenizer) - - def get_vocab_size(self): - return self.tokenizer.get_vocab_size() - - def get_special_tokens(self): - special_tokens_map = self.tokenizer.get_added_tokens_decoder() - special_tokens = [w.content for w in special_tokens_map.values()] - return special_tokens - - def id_to_token(self, id): - return self.tokenizer.id_to_token(id) - - def _encode_one(self, text, prepend=None, append=None): - # encode a single string - # prepend/append can be either a string of a special token or a token id directly. - assert isinstance(text, str) - ids = [] - if prepend is not None: - prepend_id = ( - prepend if isinstance(prepend, int) else self.encode_special(prepend) - ) - ids.append(prepend_id) - ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) - if append is not None: - append_id = ( - append if isinstance(append, int) else self.encode_special(append) - ) - ids.append(append_id) - return ids - - def encode_special(self, text): - # encode a single special token via exact match - return self.tokenizer.token_to_id(text) - - def get_bos_token_id(self): - bos = self.encode_special("<|bos|>") - return bos - - def encode(self, text, *args, **kwargs): - if isinstance(text, str): - return self._encode_one(text, *args, **kwargs) - elif isinstance(text, list): - return [self._encode_one(t, *args, **kwargs) for t in text] - else: - raise ValueError(f"Invalid input type: {type(text)}") - - def __call__(self, *args, **kwargs): - return self.encode(*args, **kwargs) - - def decode(self, ids): - return self.tokenizer.decode(ids, skip_special_tokens=False) - - def save(self, tokenizer_dir): - # save the tokenizer to disk - os.makedirs(tokenizer_dir, exist_ok=True) - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - self.tokenizer.save(tokenizer_path) - print(f"Saved tokenizer to {tokenizer_path}") - - -# ----------------------------------------------------------------------------- -# Universal Tokenizer Wrapper that works with any HuggingFace model -class UniversalHuggingFaceTokenizer: - """Universal wrapper that works with any HuggingFace model""" - - def __init__(self, tokenizer_dir, model_config=None): - self.tokenizer = HuggingFaceTokenizer.from_directory(tokenizer_dir) - self.model_config = model_config - self._pad_token_id = None - self._bos_token_id = None - self._detect_special_tokens() - - def _detect_special_tokens(self): - """Auto-detect special tokens for any model""" - # Try to get pad token from tokenizer - if hasattr(self.tokenizer, "tokenizer"): - tokenizer_obj = self.tokenizer.tokenizer - - # Try common pad token names - pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] - for candidate in pad_candidates: - if hasattr(tokenizer_obj, "token_to_id"): - token_id = tokenizer_obj.token_to_id(candidate) - if token_id is not None: - 
self._pad_token_id = token_id - break - - # Try common BOS token names - bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] - for candidate in bos_candidates: - if hasattr(tokenizer_obj, "token_to_id"): - token_id = tokenizer_obj.token_to_id(candidate) - if token_id is not None: - self._bos_token_id = token_id - break - - # Fallback to config-based detection - if self.model_config and hasattr(self.model_config, "pad_token_id"): - self._pad_token_id = self.model_config.pad_token_id - - if self.model_config and hasattr(self.model_config, "bos_token_id"): - self._bos_token_id = self.model_config.bos_token_id - - # Final fallbacks based on common patterns - if self._pad_token_id is None: - # Most models use either 0 or their EOS token - self._pad_token_id = 0 - - if self._bos_token_id is None: - # Use pad token as fallback - self._bos_token_id = self._pad_token_id - - def get_bos_token_id(self): - return self._bos_token_id - - def get_pad_token_id(self): - return self._pad_token_id - - def __call__(self, prompts, prepend=None): - """Universal tokenization method""" - if isinstance(prompts, str): - prompts = [prompts] - - result = [] - for prompt in prompts: - tokens = self.tokenizer.encode(prompt) - if prepend is not None: - tokens = [prepend] + tokens - result.append(tokens) - - return result[0] if len(result) == 1 else result - - def __getattr__(self, name): - return getattr(self.tokenizer, name) From a978bebb17748ca708d6291aba35bf663a9729f1 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 02:58:50 -0400 Subject: [PATCH 06/17] Added abstract eval_model() to TestingStrategy. --- plato/trainers/strategies/base.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/plato/trainers/strategies/base.py b/plato/trainers/strategies/base.py index a4c3159b6..a119ab20b 100644 --- a/plato/trainers/strategies/base.py +++ b/plato/trainers/strategies/base.py @@ -554,3 +554,36 @@ def test_model( setting eval mode, and computing the metric. """ pass + + @abstractmethod + def eval_model( + self, + model: nn.Module, + config: dict[str, Any], + benchmark, + sampler, + context: TrainingContext, + ) -> dict[str, Any]: + """ + Evaluate the model on benchmark and return results. + + Args: + model: The model to test + config: Testing configuration dictionary + benchmark: Benchmark instance for evaluation + sampler: Optional data sampler for test set + context: Training context with device, client_id, etc. + + Returns: + Benchmark results dictionary containing evaluation metrics. + For CORE benchmark, this includes: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + + Note: + This method should handle moving model to device, + setting eval mode, and computing the benchmark metrics. + The specific return format depends on the benchmark type. + """ + pass \ No newline at end of file From c9db3e02002155ca55b1635c2777aed955a66013 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:02:43 -0400 Subject: [PATCH 07/17] Added eval_model() to DefaultTestingStrategy. - Introduced eval_model() in testing.py to define a placeholder interface for benchmark-based evaluation. - The default strategy now raises NotImplementedError to prompt use of specialized testing strategies. 
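- For reference, a specialized strategy would override eval_model() along the
  following lines. This is only a sketch: the class name is hypothetical, the
  tokenizer lookup is an assumption, and only the benchmark wiring (model,
  tokenizer, device, then evaluate()) mirrors plato/benchmarks/core.py.

    class CoreBenchmarkTestingStrategy(DefaultTestingStrategy):
        """Hypothetical strategy that evaluates a model on a benchmark."""

        def eval_model(self, model, config, benchmark, sampler, context):
            # Hand the trainer's model, tokenizer, and device to the benchmark,
            # then delegate the evaluation loop to benchmark.evaluate().
            model.to(context.device)
            model.eval()
            benchmark.model = model
            benchmark.device = context.device
            benchmark.tokenizer = getattr(model, "tokenizer", None)  # assumed attribute
            return benchmark.evaluate()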
--- plato/trainers/strategies/testing.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/plato/trainers/strategies/testing.py b/plato/trainers/strategies/testing.py index 80e6c424f..dd76b9ff4 100644 --- a/plato/trainers/strategies/testing.py +++ b/plato/trainers/strategies/testing.py @@ -7,6 +7,7 @@ import logging import os +from typing import Any import torch @@ -97,3 +98,34 @@ def test_model(self, model, config, testset, sampler, context): ) return accuracy + + def eval_model( + self, + model, + config, + benchmark, + sampler, + context + ) -> dict[str, Any]: + """ + Evaluate the model on benchmark and return results. + + Args: + model: The model to test + config: Testing configuration dictionary + benchmark: Benchmark instance for evaluation + sampler: Optional data sampler for test set + context: Training context with device, client_id, etc. + + Returns: + Benchmark results dictionary + + Note: + DefaultTestingStrategy does not implement benchmark evaluation. + Use a specialized testing strategy (e.g., LLMSplitLearningTestingStrategy) + for benchmark support. + """ + raise NotImplementedError( + "DefaultTestingStrategy does not support benchmark evaluation. " + "Please implement a custom TestingStrategy with eval_model() for your use case." + ) \ No newline at end of file From 34ec5820b53590ba15c24b3bb07bf7e0c74d93f3 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:05:47 -0400 Subject: [PATCH 08/17] Added benchmark result save/load utilities. - Added static methods save_benchmark_result() and load_benchmark_result() in base.py for saving and loading benchmark evaluation results. --- plato/trainers/base.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/plato/trainers/base.py b/plato/trainers/base.py index 306ca0c4f..9b67a86b7 100644 --- a/plato/trainers/base.py +++ b/plato/trainers/base.py @@ -3,6 +3,7 @@ """ import os +import json from abc import ABC, abstractmethod from typing import Any, Optional @@ -77,6 +78,39 @@ def load_accuracy(filename=None): return accuracy + @staticmethod + def save_benchmark_result(benchmark_result, filename=None): + """Saving the benchmark result to a file.""" + model_path = Config().params["model_path"] + model_name = Config().trainer.model_name + + if not os.path.exists(model_path): + os.makedirs(model_path) + + if filename is not None: + benchmark_result_path = f"{model_path}/{filename}" + else: + benchmark_result_path = f"{model_path}/{model_name}.eval" + + with open(benchmark_result_path, "w", encoding="utf-8") as file: + json.dump(benchmark_result, file) + + @staticmethod + def load_benchmark_result(filename=None): + """Loading the benchmark result from a file.""" + model_path = Config().params["model_path"] + model_name = Config().trainer.model_name + + if filename is not None: + benchmark_result_path = f"{model_path}/{filename}" + else: + benchmark_result_path = f"{model_path}/{model_name}.eval" + + with open(benchmark_result_path, encoding="utf-8") as file: + benchmark_result = json.load(file) + + return benchmark_result + def pause_training(self): """Remove files of running trainers.""" if hasattr(Config().trainer, "max_concurrency"): From 956922f38bf8ed00b2f3a1d39e774a9b0441b05a Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:11:09 -0400 Subject: [PATCH 09/17] Implemented benchmark evaluation pipeline with multiprocessing. - Implemented benchmark evaluation pipeline in plato/trainers/composable.py. 
- Added eval_model(), eval(), and eval_process() methods. --- plato/trainers/composable.py | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/plato/trainers/composable.py b/plato/trainers/composable.py index a1da4a5eb..164c111bb 100644 --- a/plato/trainers/composable.py +++ b/plato/trainers/composable.py @@ -793,6 +793,85 @@ def test_model(self, config, testset, sampler=None, **kwargs): return accuracy + def eval_process(self, config, benchmark, sampler=None, **kwargs): + """The evaluating loop, run in a separate process.""" + self.eval_model(config, benchmark, sampler, **kwargs) + + model_name = Config().trainer.model_name + filename = f"{model_name}_{self.client_id}_{config['run_id']}.eval" + self.save_benchmark_result(self.benchmark_result, filename) + + def eval(self, benchmark, sampler=None, **kwargs) -> dict[str, Any]: + """ + Evaluate the model using the provided benchmark. + + Args: + benchmark: benchmark instance (from benchmarks.registry.get()) + sampler: The sampler for the test dataset + **kwargs: Additional keyword arguments + + Returns: + Accuracy on benchmark + """ + config = Config().trainer._asdict() + config["run_id"] = Config().params["run_id"] + + if "max_concurrency" in config: + model = self._require_model() + model.cpu() + + if mp.get_start_method(allow_none=True) != "spawn": + mp.set_start_method("spawn", force=True) + + eval_proc = mp.Process( + target=self.eval_process, + args=(config, benchmark, sampler), + kwargs=kwargs, + ) + eval_proc.start() + eval_proc.join() + + model_name = Config().trainer.model_name + filename = f"{model_name}_{self.client_id}_{Config().params['run_id']}.eval" + + try: + benchmark_result = self.load_benchmark_result(filename) + except OSError as error: + raise ValueError( + f"Evaluating on client {self.client_id} failed." + ) from error + + self.pause_training() + return benchmark_result + else: + return self.eval_model(config, benchmark, sampler, **kwargs) + + def eval_model(self, config, benchmark, sampler=None, **kwargs): + """ + Evaluate the model using benchmark. + + Args: + config: Evaluation configuration dictionary + benchmark: Benchmark instance (from benchmarks.registry.get()) + sampler: Optional data sampler (usually None for benchmarks) + **kwargs: Additional keyword arguments + + Returns: + Benchmark results dictionary containing: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + """ + + model = self._require_model() + result = self.testing_strategy.eval_model( + model, config, benchmark, sampler, self.context + ) + + self.benchmark_result = result + + return result + def obtain_model_update(self, config, trainset, sampler): """ Obtain model updates from training. From 103c6656df09ab9664d842156ca713e1d9bf5aab Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:11:58 -0400 Subject: [PATCH 10/17] Added registry for benchmark. --- plato/benchmarks/registry.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 plato/benchmarks/registry.py diff --git a/plato/benchmarks/registry.py b/plato/benchmarks/registry.py new file mode 100644 index 000000000..3cfb03252 --- /dev/null +++ b/plato/benchmarks/registry.py @@ -0,0 +1,30 @@ +""" +Registry for benchmarks. + +Enables runtime benchmark selection via configuration. 
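+
+A usage sketch (assuming the caller supplies model, tokenizer, and device,
+as plato/benchmarks/core.py expects before evaluate() is run):
+
+    from plato.benchmarks import registry
+
+    benchmark = registry.get("core")
+    benchmark.model = model
+    benchmark.tokenizer = tokenizer
+    benchmark.device = device
+    results = benchmark.evaluate()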
+""" +from plato.benchmarks import core +from plato.benchmarks.base import Benchmark as BenchmarkBase + +registered_benchmarks: dict[str, type[BenchmarkBase]] = { + "core": core.Benchmark, +} + +def get(type: str) -> BenchmarkBase: + """Get an instance of the benchmark.""" + if type in registered_benchmarks: + benchmark_cls = registered_benchmarks[type] + registered_benchmark = benchmark_cls() + else: + available = list(registered_benchmarks.keys()) + raise ValueError( + f"No such benchmark: {type}. " + f"Available benchmarks: {available}" + ) + + return registered_benchmark + + +def list_benchmarks(): + """List all available benchmark types.""" + return list(registered_benchmarks.keys()) From c3c5020034ebeca9c76624c2a27c001396092627 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:15:07 -0400 Subject: [PATCH 11/17] Added base class for evaluating trained models. --- plato/benchmarks/base.py | 135 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 plato/benchmarks/base.py diff --git a/plato/benchmarks/base.py b/plato/benchmarks/base.py new file mode 100644 index 000000000..a6e78516b --- /dev/null +++ b/plato/benchmarks/base.py @@ -0,0 +1,135 @@ +""" +Base class for benchmarks evaluating trained models. +""" + +from typing import Any +from abc import ABC, abstractmethod +import gzip +import logging +import os +import sys +import tarfile +import zipfile +from pathlib import Path +from urllib.parse import urlparse +import requests +import contextlib, time + +class Benchmark(ABC): + """Base class for model benchmarks.""" + + def __init__(self): + """ + Initialize the benchmark. + """ + super().__init__() + + @abstractmethod + def evaluate(self) -> dict[str, Any]: + """ + Evaluate the model on benchmark tasks. + + evaluate() returns evaluation results. + + Returns: + Dictionary of evaluation metrics + + Example: + >>> results = benchmark.evaluate() + >>> print(results) + {'task1_accuracy': 0.85, 'overall': 0.875} + """ + pass + + @abstractmethod + def get_formatted_result(self) -> str: + pass + + # Borrowed from plato/datasources/base.py + @staticmethod + @contextlib.contextmanager + def _download_guard(data_path: str): + """Serialise dataset downloads to avoid concurrent corruption.""" + os.makedirs(data_path, exist_ok=True) + lock_file = os.path.join(data_path, ".download.lock") + lock_fd = None + waited = False + + try: + while True: + try: + lock_fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_RDWR) + break + except FileExistsError: + if not waited: + logging.info( + "Another process is preparing the dataset at %s. 
Waiting.", + data_path, + ) + waited = True + time.sleep(1) + yield + finally: + if lock_fd is not None: + os.close(lock_fd) + try: + os.remove(lock_file) + except FileNotFoundError: + pass + + @staticmethod + def download(url, data_path): + """Download a dataset from a URL if it is not already available.""" + url_parse = urlparse(url) + file_name = os.path.join(data_path, url_parse.path.split("/")[-1]) + os.makedirs(data_path, exist_ok=True) + sentinel = Path(f"{file_name}.complete") + + if sentinel.exists(): + return + + with Benchmark._download_guard(data_path): + if sentinel.exists(): + return + + logging.info("Downloading %s.", url) + + res = requests.get(url, stream=True, timeout=60) + total_size = int(res.headers.get("Content-Length", 0)) + downloaded_size = 0 + + with open(file_name, "wb+") as file: + for chunk in res.iter_content(chunk_size=1024): + if not chunk: + continue + downloaded_size += len(chunk) + file.write(chunk) + file.flush() + if total_size: + sys.stdout.write(f"\r{100 * downloaded_size / total_size:.1f}%") + sys.stdout.flush() + if total_size: + sys.stdout.write("\n") + + # Unzip the compressed file just downloaded + logging.info("Decompressing the dataset downloaded.") + name, suffix = os.path.splitext(file_name) + + if file_name.endswith("tar.gz"): + with tarfile.open(file_name, "r:gz") as tar: + tar.extractall(data_path) + os.remove(file_name) + elif suffix == ".zip": + logging.info("Extracting %s to %s.", file_name, data_path) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(data_path) + elif suffix == ".gz": + with gzip.open(file_name, "rb") as zipped_file: + with open(name, "wb") as unzipped_file: + unzipped_file.write(zipped_file.read()) + os.remove(file_name) + else: + logging.info("Unknown compressed file type for %s.", file_name) + sys.exit() + + sentinel.touch() \ No newline at end of file From ed4b025de0e91902a7a43b87ffd32f775e706a11 Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:16:10 -0400 Subject: [PATCH 12/17] Added CORE benchmark implementation for language models. --- plato/benchmarks/core.py | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 plato/benchmarks/core.py diff --git a/plato/benchmarks/core.py b/plato/benchmarks/core.py new file mode 100644 index 000000000..854048c2a --- /dev/null +++ b/plato/benchmarks/core.py @@ -0,0 +1,188 @@ +""" +CORE benchmark implementation for evaluating language models. +Borrowed and adapted from: https://github.com/karpathy/nanochat +""" + +import json +import logging +import os +import random +import time +from typing import Any + +import pandas as pd +import torch +import yaml + +from plato.benchmarks import base +from plato.benchmarks.core_helpers import core +from plato.config import Config + + +class Benchmark(base.Benchmark): + """ + CORE benchmark - evaluates language models on the CORE suite. + """ + + def __init__(self): + """ + Initialize CORE benchmark -- load benchmark tasks and data. + """ + super().__init__() + + # These will be set externally before evaluate() is called + self.model = None + self.device = None + self.tokenizer = None + + # Get configuration specific to CORE benchmark + self.random_seed = getattr(Config().benchmark, 'random_seed', 24) + self.max_per_task = getattr(Config().benchmark, 'max_per_task', -1) + + # Load benchmark tasks and datasets + self._load_benchmark_data() + + + def _load_benchmark_data(self): + """ + Load CORE benchmark tasks and evaluation data. 
+ + Downloads the evaluation bundle if not already present, then loads + task configurations and data files. + """ + # Get base directory and ensure eval_bundle is downloaded + benchmark_base_dir = Config.params["benchmark_path"] + + # Download eval_bundle if not present + if not os.path.exists(benchmark_base_dir): + logging.info("CORE evaluation bundle not found. Downloading...") + eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + Benchmark.download(eval_bundle_url, benchmark_base_dir) + + # Load benchmark configuration + eval_bundle_dir = os.path.join(benchmark_base_dir, "eval_bundle") + config_path = os.path.join(eval_bundle_dir, "core.yaml") + self.eval_meta_data_path = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + self.data_base_path = os.path.join(eval_bundle_dir, "eval_data") + + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + self.tasks = config["icl_tasks"] + self.eval_metadata = pd.read_csv(self.eval_meta_data_path) + + def evaluate(self) -> dict[str, Any]: + """ + Evaluate the model on all CORE tasks. + + Returns: + Dictionary containing: + - 'results': per-task accuracies + - 'centered_results': normalized scores + - 'core_metric': overall CORE score + """ + + if self.model is None: + raise RuntimeError("Trainer has no model - cannot run benchmark") + + if self.tokenizer is None: + raise RuntimeError("Trainer has no tokenizer - cannot run benchmark") + + results = {} + centered_results = {} + + # Set model to eval mode + self.model.eval() + + with torch.no_grad(): + for task in self.tasks: + start_time = time.time() + label = task["label"] + + task_meta = { + "task_type": task["icl_task_type"], + "dataset_uri": task["dataset_uri"], + "num_fewshot": task["num_fewshot"][0], + "continuation_delimiter": task.get("continuation_delimiter", " "), + } + + logging.info( + "Evaluating task: %s (%d-shot, type: %s)", + label, + task_meta['num_fewshot'], + task_meta['task_type'] + ) + + # Load data for this task (matching evaluate_model.py pattern) + data_path = os.path.join(self.data_base_path, task_meta["dataset_uri"]) + with open(data_path, "r") as f: + data = [json.loads(line.strip()) for line in f] + + # Shuffle the data for reproducibility (matching evaluate_model.py) + shuffle_rng = random.Random(self.random_seed) + shuffle_rng.shuffle(data) + + # Crop data if max_per_task is specified + if self.max_per_task > 0: + data = data[:self.max_per_task] + + # Run evaluation using existing core_eval logic + accuracy = core.evaluate_task( + self.model, # Model in CUDA memory from trainer + self.tokenizer, # Tokenizer from trainer + data, + self.device, + task_meta + ) + + results[label] = accuracy + + # Compute centered result (normalized by random baseline) + row = self.eval_metadata[self.eval_metadata["Eval Task"] == label] + random_baseline = row["Random baseline"].values[0] + centered = (accuracy - 0.01 * random_baseline) / ( + 1.0 - 0.01 * random_baseline + ) + centered_results[label] = centered + + elapsed = time.time() - start_time + logging.info( + "accuracy: %.4f | centered: %.4f | time: %.2fs", + accuracy, + centered, + elapsed + ) + + # Compute overall CORE metric + core_metric = sum(centered_results.values()) / len(centered_results) + + + return { + "results": results, + "centered_results": centered_results, + "core_metric": core_metric, + } + + def get_formatted_result(self, evaluation_result: dict[str, Any]) -> str: + """ + Format the evaluation results for display. 
+ + Args: + evaluation_result: The dictionary returned by the evaluate() method. + Returns: + A formatted string summarizing the results. + """ + results = evaluation_result["results"] + centered_results = evaluation_result["centered_results"] + core_metric = evaluation_result["core_metric"] + + result_lines = [f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}"] + for task, acc in results.items(): + centered = centered_results[task] + result_lines.append( + f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}" + ) + result_lines.append(f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n") + + return "\n".join(result_lines) + \ No newline at end of file From d8cf2147ff13607776864ca2a78f632ee8484c1a Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:18:51 -0400 Subject: [PATCH 13/17] Added helper functions for CORE benchmark implementation. --- plato/benchmarks/__init__.py | 0 plato/benchmarks/core_helpers/core.py | 281 ++++++++++++++++++++ plato/benchmarks/core_helpers/tokenizer.py | 290 +++++++++++++++++++++ 3 files changed, 571 insertions(+) create mode 100644 plato/benchmarks/__init__.py create mode 100644 plato/benchmarks/core_helpers/core.py create mode 100644 plato/benchmarks/core_helpers/tokenizer.py diff --git a/plato/benchmarks/__init__.py b/plato/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plato/benchmarks/core_helpers/core.py b/plato/benchmarks/core_helpers/core.py new file mode 100644 index 000000000..2639c9c48 --- /dev/null +++ b/plato/benchmarks/core_helpers/core.py @@ -0,0 +1,281 @@ +""" +Borrowed and adapted from: https://github.com/karpathy/nanochat + +Functions for evaluating the CORE metric, as described in the DCLM paper. +https://arxiv.org/abs/2406.11794 + +TODOs: +- All tasks ~match except for squad. We get 31% reference is 37%. Figure out why. 
+""" + +import random + +from jinja2 import Template +import torch + +from plato.benchmarks.core_helpers.tokenizer import UniversalHuggingFaceTokenizer + + +def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a multiple choice question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }} + +{% endfor -%} +{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [template.render(choice=choice, **context) for choice in item["choices"]] + return prompts + + +def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None): + """Render complete prompts for a schema question""" + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + prompts = [ + template.render(context=context_option, **context) + for context_option in item["context_options"] + ] + return prompts + + +def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None): + """ + Render complete prompt for a language modeling task. + Notice that we manually trim the context in the template, + which in some datasets seems to have trailing whitespace (which we don't want). + """ + template_str = """ +{%- for example in fewshot_examples -%} +{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }} + +{% endfor -%} +{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip() + template = Template(template_str) + fewshot_examples = fewshot_examples or [] + context = { + "fewshot_examples": fewshot_examples, + "continuation_delimiter": continuation_delimiter, + "item": item, + } + # Return two prompts: without and with the continuation + prompt_without = template.render(include_continuation=False, **context) + prompt_with = template.render(include_continuation=True, **context) + # Due to the way the data seems to be stored, I think I need to strip in the case of LM here. + # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next + # token in prompt_with), meaning we don't get a nice and clean prefix in the token space + # to detect the final continuation. Tokenizers... 
+ prompt_without = prompt_without.strip() + return [prompt_without, prompt_with] + + +def find_common_length(token_sequences, direction="left"): + """ + Find the length of the common prefix or suffix across token sequences + - direction: 'left' for prefix, 'right' for suffix + """ + min_len = min(len(seq) for seq in token_sequences) + indices = {"left": range(min_len), "right": range(-1, -min_len - 1, -1)}[direction] + # Find the first position where the token sequences differ + for i, idx in enumerate(indices): + token = token_sequences[0][idx] + if not all(seq[idx] == token for seq in token_sequences): + return i + return min_len + + +def stack_sequences(tokens, pad_token_id): + """Stack up a list of token sequences, pad to longest on the right""" + bsz, seq_len = len(tokens), max(len(x) for x in tokens) + input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long) + for i, x in enumerate(tokens): + input_ids[i, : len(x)] = torch.tensor(x, dtype=torch.long) + return input_ids + + +def batch_sequences_mc(tokenizer, prompts): + # In multiple choice, contexts are the same but the continuation is different (common prefix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each continuation + answer_start_idx = find_common_length(tokens, direction="left") + start_indices = [answer_start_idx] * len(prompts) + end_indices = [len(x) for x in tokens] + return tokens, start_indices, end_indices + + +def batch_sequences_schema(tokenizer, prompts): + # In schema tasks, contexts vary but continuation is the same (common suffix) + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + # figure out the start and end of each context + suffix_length = find_common_length(tokens, direction="right") + end_indices = [len(x) for x in tokens] + start_indices = [ei - suffix_length for ei in end_indices] + return tokens, start_indices, end_indices + + +def batch_sequences_lm(tokenizer, prompts): + # In LM tasks, we have two prompts: without and with continuation + tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id()) + tokens_without, tokens_with = tokens + start_idx, end_idx = len(tokens_without), len(tokens_with) + assert start_idx < end_idx, ( + "prompt without is supposed to be a prefix of prompt with" + ) + assert tokens_without == tokens_with[:start_idx], ( + "prompt without is supposed to be a prefix of prompt with" + ) + # we only need the with continuation prompt in the LM task, i.e. batch size of 1 + return [tokens_with], [start_idx], [end_idx] + + +@torch.no_grad() +def forward_model(model, input_ids): + """ + Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions. + The last column of losses is set to nan because we don't have autoregressive targets there. 
+ """ + batch_size, seq_len = input_ids.size() + outputs = model(input_ids) + + # Extract logits from model output (handles both raw tensors and HuggingFace output objects) + if hasattr(outputs, 'logits'): + logits = outputs.logits + else: + logits = outputs + + # Roll the tensor to the left by one position to get the (autoregressive) target ids + target_ids = torch.roll(input_ids, shifts=-1, dims=1) + # Calculate cross entropy at all positions + losses = torch.nn.functional.cross_entropy( + logits.view(batch_size * seq_len, -1), + target_ids.view(batch_size * seq_len), + reduction="none", + ).view(batch_size, seq_len) + # Set the last column to be nan because there is no autoregressive loss there + losses[:, -1] = float("nan") + # Get the argmax predictions at each position + predictions = logits.argmax(dim=-1) + return losses, predictions + + +@torch.no_grad() +def evaluate_example(idx, model, tokenizer, data, device, task_meta): + """Evaluate a single example, return True if correct, False otherwise""" + item = data[idx] + task_type = task_meta["task_type"] + num_fewshot = task_meta["num_fewshot"] + continuation_delimiter = task_meta["continuation_delimiter"] + + # Sample few-shot examples (excluding current item) + fewshot_examples = [] + if num_fewshot > 0: + rng = random.Random(1234 + idx) + available_indices = [i for i in range(len(data)) if i != idx] + fewshot_indices = rng.sample(available_indices, num_fewshot) + fewshot_examples = [data[i] for i in fewshot_indices] + + # Render prompts and batch sequences based on task type + if task_type == "multiple_choice": + prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts) + elif task_type == "schema": + prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts) + elif task_type == "language_modeling": + prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples) + tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts) + else: + raise ValueError(f"Unsupported task type: {task_type}") + + # Some models can't forward sequences beyond a certain length (e.g. GPT-2) + # In these cases, we have to truncate sequences to max length and adjust the indices + max_tokens = None + if hasattr(model, "max_seq_len") and model.max_seq_len is not None: + max_tokens = model.max_seq_len + elif hasattr(model, "config"): + # For HuggingFace models, check common config attributes + if hasattr(model.config, "n_positions"): + max_tokens = model.config.n_positions + elif hasattr(model.config, "max_position_embeddings"): + max_tokens = model.config.max_position_embeddings + else: + max_tokens = 1024 # default to 1024 (GPT-2) if no info available + + if max_tokens is not None: + new_tokens, new_start_idxs, new_end_idxs = [], [], [] + for t, s, e in zip(tokens, start_idxs, end_idxs): + if len(t) > max_tokens: + num_to_crop = len(t) - max_tokens + new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens + new_start_idxs.append(s - num_to_crop) # shift the indices down + new_end_idxs.append(e - num_to_crop) + assert s - num_to_crop >= 0, "this should never happen right?" + assert e - num_to_crop >= 0, "this should never happen right?" 
+ else: + new_tokens.append(t) # keep unchanged + new_start_idxs.append(s) + new_end_idxs.append(e) + tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs + + # Stack up all the sequences into a batch + pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok + input_ids = stack_sequences(tokens, pad_token_id) + input_ids = input_ids.to(device) + + # Forward the model, get the autoregressive loss and argmax prediction at each token + losses, predictions = forward_model(model, input_ids) + + # See if the losses/predictions come out correctly + if task_type == "language_modeling": + # language modeling task is currently always batch size 1 + si = start_idxs[0] + ei = end_idxs[0] + # predictions[i] predict input_ids[i+1] autoregressively + predicted_tokens = predictions[0, si - 1 : ei - 1] + actual_tokens = input_ids[0, si:ei] + is_correct = torch.all(predicted_tokens == actual_tokens).item() + elif task_type in ["multiple_choice", "schema"]: + # For MC/schema: find the option with lowest average loss + mean_losses = [ + losses[i, si - 1 : ei - 1].mean().item() + for i, (si, ei) in enumerate(zip(start_idxs, end_idxs)) + ] + pred_idx = mean_losses.index(min(mean_losses)) + is_correct = pred_idx == item["gold"] + else: + raise ValueError(f"Unsupported task type: {task_type}") + + return is_correct + + +def evaluate_task(model, tokenizer, data, device, task_meta): + """ + This function is responsible for evaluating one task across many examples. + """ + # wrap tokenizer with Universal wrapper for compatibility + tokenizer = UniversalHuggingFaceTokenizer(tokenizer) + correct = torch.zeros(len(data), dtype=torch.float32, device=device) + for idx in range(len(data)): + is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta) + correct[idx] = float(is_correct) + # compute the mean + mean_correct = correct.mean().item() + return mean_correct diff --git a/plato/benchmarks/core_helpers/tokenizer.py b/plato/benchmarks/core_helpers/tokenizer.py new file mode 100644 index 000000000..7b6f0e4d5 --- /dev/null +++ b/plato/benchmarks/core_helpers/tokenizer.py @@ -0,0 +1,290 @@ +""" +BPE Tokenizer in the style of GPT-4. + +Two implementations are available: +1) HuggingFace Tokenizer that can do both training and inference but is really confusing +2) Universal Wrapper that can load any HuggingFace tokenizer (e.g., for GPT-2 which has slightly different tokenization rules than GPT-4) for inference only. +""" + +import os + +SPECIAL_TOKENS = [ + # every document begins with the Beginning of Sequence (BOS) token that delimits documents + "<|bos|>", + # tokens below are only used during finetuning to render Conversations into token ids + "<|user_start|>", # user messages + "<|user_end|>", + "<|assistant_start|>", # assistant messages + "<|assistant_end|>", + "<|python_start|>", # assistant invokes python REPL tool + "<|python_end|>", + "<|output_start|>", # python REPL outputs back to assistant + "<|output_end|>", +] + +# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} +# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. +# I haven't validated that this is actually a good idea, TODO. 
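+# (Illustrative example, not from the original: under \p{N}{1,2} the digit run "2025"
+# pre-splits into "20" + "25" before BPE, whereas the GPT-4 pattern \p{N}{1,3} would
+# give "202" + "5".)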
+SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" + +# ----------------------------------------------------------------------------- +# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer +from tokenizers import Tokenizer as HFTokenizer +from tokenizers import pre_tokenizers, decoders, Regex +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + + +class HuggingFaceTokenizer: + """Light wrapper around HuggingFace Tokenizer for some utilities""" + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, hf_path): + # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") + tokenizer = HFTokenizer.from_pretrained(hf_path) + return cls(tokenizer) + + @classmethod + def from_directory(cls, tokenizer_dir): + # init from a local directory on disk (e.g. "out/tokenizer") + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + tokenizer = HFTokenizer.from_file(tokenizer_path) + return cls(tokenizer) + + @classmethod + def train_from_iterator(cls, text_iterator, vocab_size): + # train from an iterator of text + # Configure the HuggingFace Tokenizer + tokenizer = HFTokenizer( + BPE( + byte_fallback=True, # needed! + unk_token=None, + fuse_unk=False, + ) + ) + # Normalizer: None + tokenizer.normalizer = None + # Pre-tokenizer: GPT-4 style + # the regex pattern used by GPT-4 to split text into groups before BPE + # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to + # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. + # (but I haven't validated this! TODO) + gpt4_split_regex = Regex( + SPLIT_PATTERN + ) # huggingface demands that you wrap it in Regex!! + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + pattern=gpt4_split_regex, behavior="isolated", invert=False + ), + pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False), + ] + ) + # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) + tokenizer.decoder = decoders.ByteLevel() + # Post-processor: None + tokenizer.post_processor = None + # Trainer: BPE + trainer = BpeTrainer( + vocab_size=vocab_size, + show_progress=True, + min_frequency=0, # no minimum frequency + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + special_tokens=SPECIAL_TOKENS, + ) + # Kick off the training + tokenizer.train_from_iterator(text_iterator, trainer) + return cls(tokenizer) + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + + def get_special_tokens(self): + special_tokens_map = self.tokenizer.get_added_tokens_decoder() + special_tokens = [w.content for w in special_tokens_map.values()] + return special_tokens + + def id_to_token(self, id): + return self.tokenizer.id_to_token(id) + + def _encode_one(self, text, prepend=None, append=None): + # encode a single string + # prepend/append can be either a string of a special token or a token id directly. 
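+        # (Hypothetical example: _encode_one("hello", prepend="<|bos|>") returns
+        # [bos_id, *ids_of("hello")]; passing prepend=bos_id directly behaves the same.)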
+ assert isinstance(text, str) + ids = [] + if prepend is not None: + prepend_id = ( + prepend if isinstance(prepend, int) else self.encode_special(prepend) + ) + ids.append(prepend_id) + ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) + if append is not None: + append_id = ( + append if isinstance(append, int) else self.encode_special(append) + ) + ids.append(append_id) + return ids + + def encode_special(self, text): + # encode a single special token via exact match + return self.tokenizer.token_to_id(text) + + def get_bos_token_id(self): + bos = self.encode_special("<|bos|>") + return bos + + def encode(self, text, *args, **kwargs): + if isinstance(text, str): + return self._encode_one(text, *args, **kwargs) + elif isinstance(text, list): + return [self._encode_one(t, *args, **kwargs) for t in text] + else: + raise ValueError(f"Invalid input type: {type(text)}") + + def __call__(self, *args, **kwargs): + return self.encode(*args, **kwargs) + + def decode(self, ids): + return self.tokenizer.decode(ids, skip_special_tokens=False) + + def save(self, tokenizer_dir): + # save the tokenizer to disk + os.makedirs(tokenizer_dir, exist_ok=True) + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + self.tokenizer.save(tokenizer_path) + print(f"Saved tokenizer to {tokenizer_path}") + + +# ----------------------------------------------------------------------------- +# Universal Tokenizer Wrapper that works with any HuggingFace model +# For example, GPT2TokenizerFast doesn't have a get_bos_token_id() method, +# so we need this wrapper to provide a unified interface. + + +class UniversalHuggingFaceTokenizer: + """ + Universal wrapper that provides a consistent interface for any HuggingFace tokenizer. + + This wrapper automatically detects special tokens (BOS, PAD, EOS) and provides + utility methods that work across different tokenizer implementations. + """ + + def __init__(self, tokenizer): + """ + Initialize the wrapper with a HuggingFace tokenizer. + + Args: + tokenizer: A HuggingFace tokenizer instance (e.g., GPT2TokenizerFast) + """ + self.tokenizer = tokenizer + self._pad_token_id = None + self._bos_token_id = None + self._eos_token_id = None + self._detect_special_tokens() + + def _detect_special_tokens(self): + """ + Auto-detect special token IDs from the tokenizer. + + Detection strategy (in order of priority): + 1. Try direct attributes on the tokenizer (bos_token_id, pad_token_id, eos_token_id) + 2. For missing tokens, use EOS as BOS/PAD for models like GPT-2 + 3. Try token_to_id() method with common token names + 4. 
Final fallbacks: 0 for pad, pad for bos + """ + # Strategy 1: Direct attributes (works for most HuggingFace tokenizers) + if hasattr(self.tokenizer, 'bos_token_id') and self.tokenizer.bos_token_id is not None: + self._bos_token_id = self.tokenizer.bos_token_id + + if hasattr(self.tokenizer, 'pad_token_id') and self.tokenizer.pad_token_id is not None: + self._pad_token_id = self.tokenizer.pad_token_id + + if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + self._eos_token_id = self.tokenizer.eos_token_id + # For GPT-2 and similar models, BOS is often the same as EOS + if self._bos_token_id is None: + self._bos_token_id = self._eos_token_id + # Use EOS as pad if no pad token exists + if self._pad_token_id is None: + self._pad_token_id = self._eos_token_id + + # Strategy 2: Try token_to_id method for tokenizers with nested structure + if hasattr(self.tokenizer, "tokenizer"): + tokenizer_obj = self.tokenizer.tokenizer + + if self._pad_token_id is None: + pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] + self._pad_token_id = self._try_token_candidates(tokenizer_obj, pad_candidates) + + if self._bos_token_id is None: + bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] + self._bos_token_id = self._try_token_candidates(tokenizer_obj, bos_candidates) + + # Strategy 3: Final fallbacks + if self._pad_token_id is None: + self._pad_token_id = 0 # Most models default to 0 + + if self._bos_token_id is None: + self._bos_token_id = self._pad_token_id + + def _try_token_candidates(self, tokenizer_obj, candidates): + """ + Try to find a token ID from a list of candidate token strings. + + Args: + tokenizer_obj: The tokenizer object with token_to_id method + candidates: List of token strings to try + + Returns: + Token ID if found, None otherwise + """ + if not hasattr(tokenizer_obj, "token_to_id"): + return None + + for candidate in candidates: + token_id = tokenizer_obj.token_to_id(candidate) + if token_id is not None: + return token_id + return None + + def get_bos_token_id(self): + """Get the beginning-of-sequence token ID.""" + return self._bos_token_id + + def get_pad_token_id(self): + """Get the padding token ID.""" + return self._pad_token_id + + def get_eos_token_id(self): + """Get the end-of-sequence token ID.""" + return self._eos_token_id + + def __call__(self, prompts, prepend=None): + """ + Tokenize prompts with optional prepended token. + + Args: + prompts: Single string or list of strings to tokenize + prepend: Optional token ID to prepend to each sequence + + Returns: + List of token IDs, or list of lists if multiple prompts + """ + if isinstance(prompts, str): + prompts = [prompts] + + result = [] + for prompt in prompts: + tokens = self.tokenizer.encode(prompt) + if prepend is not None: + tokens = [prepend] + tokens + result.append(tokens) + + return result[0] if len(result) == 1 else result + + def __getattr__(self, name): + """Delegate all other attributes to the wrapped tokenizer.""" + return getattr(self.tokenizer, name) From aaac39bbf61eab5234068df88be9e1e4d4e0543c Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:21:26 -0400 Subject: [PATCH 14/17] Added benchmark evaluation support in fedavg.py. 
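
With a [benchmark] section that carries a "type" field in the configuration, the server now
resolves the benchmark through the registry and evaluates the aggregated global model right
after the usual accuracy test in _process_reports().

A minimal standalone sketch of the same flow (a sketch only, under these assumptions: a
Plato configuration with a [benchmark] section of type "core" has already been loaded,
since the CORE benchmark reads Config() in its constructor, and a HuggingFace GPT-2 model
and tokenizer stand in for the trainer-managed objects):

    import torch
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    from plato.benchmarks import registry as benchmarks_registry

    # Resolve the benchmark the same way fedavg.py does.
    benchmark = benchmarks_registry.get("core")

    # The CORE benchmark expects model, tokenizer, and device to be assigned
    # externally before evaluate() is called.
    benchmark.device = "cuda" if torch.cuda.is_available() else "cpu"
    benchmark.model = GPT2LMHeadModel.from_pretrained("gpt2").to(benchmark.device)
    benchmark.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    result = benchmark.evaluate()
    print(benchmark.get_formatted_result(result))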
--- plato/servers/fedavg.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/plato/servers/fedavg.py b/plato/servers/fedavg.py index ee636cb49..b95fe76b8 100644 --- a/plato/servers/fedavg.py +++ b/plato/servers/fedavg.py @@ -7,6 +7,7 @@ import os from plato.algorithms import registry as algorithms_registry +from plato.benchmarks import registry as benchmarks_registry from plato.config import Config from plato.datasources import registry as datasources_registry from plato.processors import registry as processor_registry @@ -50,6 +51,8 @@ def __init__( self.testset_sampler = None self.total_samples = 0 + self.benchmark = None + self.total_clients = Config().clients.total_clients self.clients_per_round = Config().clients.per_round @@ -252,6 +255,17 @@ async def _process_reports(self): trainer = self.require_trainer() self.accuracy = trainer.test(self.testset, self.testset_sampler) + # Evaluating the global model on the specified benchmark + if hasattr(Config().config, "benchmark") and hasattr(Config().benchmark, "type"): + benchmark_type = Config().benchmark.type + if self.benchmark is None: + self.benchmark = benchmarks_registry.get(benchmark_type) + logging.info("[%s] Started model evaluation on benchmark %s.", self, benchmark_type) + trainer = self.require_trainer() + self.benchmark_result = trainer.eval(self.benchmark, self.testset_sampler) + logging.info("[%s] Model evaluation result on benchmark %s:\n%s.", self, benchmark_type, self.benchmark.get_formatted_result(self.benchmark_result)) + + if hasattr(Config().trainer, "target_perplexity"): logging.info( fonts.colourize( From 0a36f74a0d3f72309b9f012060a17e1db25c85bb Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:21:59 -0400 Subject: [PATCH 15/17] Added benchmark configuration support in config.py. --- plato/config.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/plato/config.py b/plato/config.py index ec7d840a0..a3c73344e 100644 --- a/plato/config.py +++ b/plato/config.py @@ -153,6 +153,7 @@ class Config: clients: Any server: Any data: Any + benchmark: Any trainer: Any algorithm: Any results: Any @@ -342,6 +343,20 @@ def __new__(cls): Config.params["base_path"], "data" ) + # User specific benchmark + if hasattr(config, "benchmark"): + Config.benchmark = config.benchmark + + # Directory of benchmark dataset + if hasattr(Config().benchmark, "data_path"): + Config.params["benchmark_path"] = os.path.join( + Config.params["base_path"], Config().benchmark.data_path + ) + else: + Config.params["benchmark_path"] = os.path.join( + Config.params["base_path"], "benchmark" + ) + # Pretrained models if hasattr(Config().server, "model_path"): Config.params["model_path"] = os.path.join( @@ -401,6 +416,10 @@ def __new__(cls): if hasattr(config, "parameters"): Config.parameters = config.parameters + + # Benchmark configuration (for model evaluation) + if hasattr(config, "benchmark"): + Config.benchmark = config.benchmark return cls._instance From e60d99bc73572896cd1975cc8b979a544066fcae Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:26:31 -0400 Subject: [PATCH 16/17] Added support for split learning benchmark evaluation. - Enabled benchmark evaluation in split learning to test and validate benchmark implementations. 
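
For reference when reading the logged benchmark output: the CORE benchmark reports, for
each task, both the raw accuracy and a score centered against that task's random-guessing
baseline,

    centered = (accuracy - 0.01 * random_baseline) / (1 - 0.01 * random_baseline)

so a hypothetical 4-way multiple-choice task with a 25 percent baseline and an accuracy of
0.40 would be centered to (0.40 - 0.25) / 0.75 = 0.20. The overall CORE metric printed at
the end of the report is the unweighted mean of the centered scores across tasks.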
--- .../split_learning_trainer.py | 50 +++++++++++++++++++ .../split_learning_wikitext2_gpt2.toml | 7 ++- plato/trainers/split_learning.py | 2 + 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/examples/split_learning/llm_split_learning/split_learning_trainer.py b/examples/split_learning/llm_split_learning/split_learning_trainer.py index b921eba66..87b97a56d 100644 --- a/examples/split_learning/llm_split_learning/split_learning_trainer.py +++ b/examples/split_learning/llm_split_learning/split_learning_trainer.py @@ -161,7 +161,57 @@ def test_model(self, model, config, testset, sampler, context): # Save other metric information such as accuracy tester.log_metrics("eval", metrics) return metrics["eval_accuracy"] + + def eval_model( + self, + model, + config, + benchmark, + sampler, + context + ): + """ + Evaluate the model using the benchmark specified in the configuration. + + This is a specialized implementation for HuggingFace-based models. + + Arguments: + model: The model to evaluate + config: Testing configuration dictionary + benchmark: Benchmark instance (e.g., from plato.benchmarks.registry.get()) + sampler: Optional data sampler (not used for CORE benchmark) + context: Training context + Returns: + Benchmark results dictionary containing: + - 'results': per-task accuracies (for CORE) + - 'centered_results': normalized scores (for CORE) + - 'core_metric': overall benchmark score (for CORE) + """ + + if hasattr(model, "copy_weight"): + model.copy_weight() + + # Get base model if available + base_model = model.base_model if hasattr(model, "base_model") else model + + # Set model to eval mode and move to device + base_model.to(context.device) + base_model.eval() + + + if hasattr(benchmark, 'model'): + benchmark.model = base_model + if hasattr(benchmark, 'device'): + benchmark.device = context.device + if hasattr(benchmark, 'tokenizer') and self.tokenizer is not None: + benchmark.tokenizer = self.tokenizer + + # Use benchmark's evaluate method to get results + # benchmark.evaluate() returns dict with metrics + results = benchmark.evaluate() + + return results # ============================================================================ # Custom Callbacks for LLM Split Learning diff --git a/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml b/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml index cd068c09c..88c67e48b 100644 --- a/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml +++ b/examples/split_learning/llm_split_learning/split_learning_wikitext2_gpt2.toml @@ -40,13 +40,18 @@ random_seed = 1 # IID, biased, or sharded? 
sampler = "iid" +[benchmark] +type = "core" # Benchmark type (from registry) +max_per_task = 16 # Limit samples per task for faster evaluation +random_seed = 1 + [trainer] # The type of the trainer type = "split_learning" # The maximum number of training rounds -rounds = 100000 +rounds = 10 # The machine learning model model_type = "huggingface" diff --git a/plato/trainers/split_learning.py b/plato/trainers/split_learning.py index 8a8b81536..ac09322ee 100644 --- a/plato/trainers/split_learning.py +++ b/plato/trainers/split_learning.py @@ -214,6 +214,8 @@ def test_model(self, model, config, testset, sampler, context): accuracy = correct / total return accuracy + def eval_model(self, model, config, benchmark, sampler, context): + raise NotImplementedError("eval_model is not implemented yet for SplitLearningTestingStrategy.") # pylint:disable=too-many-instance-attributes class Trainer(ComposableTrainer): From ec1c1ba3403550fae5a1fcd0e44d73f10880576d Mon Sep 17 00:00:00 2001 From: Jasmine-Yuting-Zhang Date: Tue, 28 Oct 2025 03:32:50 -0400 Subject: [PATCH 17/17] Reformatted code using Ruff. --- .../split_learning_trainer.py | 21 ++--- .../fedunlearning/fedunlearning_server.py | 4 +- plato/benchmarks/base.py | 13 +-- plato/benchmarks/core.py | 79 +++++++++---------- plato/benchmarks/core_helpers/core.py | 6 +- plato/benchmarks/core_helpers/tokenizer.py | 47 +++++++---- plato/benchmarks/registry.py | 7 +- plato/config.py | 4 +- plato/servers/fedavg.py | 16 +++- plato/trainers/split_learning.py | 5 +- plato/trainers/strategies/base.py | 2 +- plato/trainers/strategies/testing.py | 11 +-- 12 files changed, 112 insertions(+), 103 deletions(-) diff --git a/examples/split_learning/llm_split_learning/split_learning_trainer.py b/examples/split_learning/llm_split_learning/split_learning_trainer.py index 87b97a56d..348d6a34b 100644 --- a/examples/split_learning/llm_split_learning/split_learning_trainer.py +++ b/examples/split_learning/llm_split_learning/split_learning_trainer.py @@ -161,15 +161,8 @@ def test_model(self, model, config, testset, sampler, context): # Save other metric information such as accuracy tester.log_metrics("eval", metrics) return metrics["eval_accuracy"] - - def eval_model( - self, - model, - config, - benchmark, - sampler, - context - ): + + def eval_model(self, model, config, benchmark, sampler, context): """ Evaluate the model using the benchmark specified in the configuration. 
@@ -194,17 +187,16 @@ def eval_model( # Get base model if available base_model = model.base_model if hasattr(model, "base_model") else model - + # Set model to eval mode and move to device base_model.to(context.device) base_model.eval() - - if hasattr(benchmark, 'model'): + if hasattr(benchmark, "model"): benchmark.model = base_model - if hasattr(benchmark, 'device'): + if hasattr(benchmark, "device"): benchmark.device = context.device - if hasattr(benchmark, 'tokenizer') and self.tokenizer is not None: + if hasattr(benchmark, "tokenizer") and self.tokenizer is not None: benchmark.tokenizer = self.tokenizer # Use benchmark's evaluate method to get results @@ -213,6 +205,7 @@ def eval_model( return results + # ============================================================================ # Custom Callbacks for LLM Split Learning # ============================================================================ diff --git a/examples/unlearning/fedunlearning/fedunlearning_server.py b/examples/unlearning/fedunlearning/fedunlearning_server.py index 8938b6549..6d6ade576 100644 --- a/examples/unlearning/fedunlearning/fedunlearning_server.py +++ b/examples/unlearning/fedunlearning/fedunlearning_server.py @@ -43,9 +43,7 @@ async def aggregate_deltas(self, updates, deltas_received, context): if not filtered_pairs: if self._fallback_to_original: - return await super().aggregate_deltas( - updates, deltas_received, context - ) + return await super().aggregate_deltas(updates, deltas_received, context) zero_delta = self._zero_delta( context, deltas_received[0] if deltas_received else None diff --git a/plato/benchmarks/base.py b/plato/benchmarks/base.py index a6e78516b..3978723ce 100644 --- a/plato/benchmarks/base.py +++ b/plato/benchmarks/base.py @@ -15,6 +15,7 @@ import requests import contextlib, time + class Benchmark(ABC): """Base class for model benchmarks.""" @@ -23,24 +24,24 @@ def __init__(self): Initialize the benchmark. """ super().__init__() - + @abstractmethod def evaluate(self) -> dict[str, Any]: """ Evaluate the model on benchmark tasks. evaluate() returns evaluation results. - + Returns: Dictionary of evaluation metrics - + Example: >>> results = benchmark.evaluate() - >>> print(results) + >>> print(results) {'task1_accuracy': 0.85, 'overall': 0.875} """ pass - + @abstractmethod def get_formatted_result(self) -> str: pass @@ -132,4 +133,4 @@ def download(url, data_path): logging.info("Unknown compressed file type for %s.", file_name) sys.exit() - sentinel.touch() \ No newline at end of file + sentinel.touch() diff --git a/plato/benchmarks/core.py b/plato/benchmarks/core.py index 854048c2a..b2af7ef53 100644 --- a/plato/benchmarks/core.py +++ b/plato/benchmarks/core.py @@ -23,58 +23,59 @@ class Benchmark(base.Benchmark): """ CORE benchmark - evaluates language models on the CORE suite. """ - + def __init__(self): """ Initialize CORE benchmark -- load benchmark tasks and data. """ super().__init__() - + # These will be set externally before evaluate() is called self.model = None self.device = None self.tokenizer = None - + # Get configuration specific to CORE benchmark - self.random_seed = getattr(Config().benchmark, 'random_seed', 24) - self.max_per_task = getattr(Config().benchmark, 'max_per_task', -1) + self.random_seed = getattr(Config().benchmark, "random_seed", 24) + self.max_per_task = getattr(Config().benchmark, "max_per_task", -1) # Load benchmark tasks and datasets self._load_benchmark_data() - def _load_benchmark_data(self): """ Load CORE benchmark tasks and evaluation data. 
- + Downloads the evaluation bundle if not already present, then loads task configurations and data files. """ # Get base directory and ensure eval_bundle is downloaded benchmark_base_dir = Config.params["benchmark_path"] - + # Download eval_bundle if not present if not os.path.exists(benchmark_base_dir): logging.info("CORE evaluation bundle not found. Downloading...") - eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + eval_bundle_url = ( + "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + ) Benchmark.download(eval_bundle_url, benchmark_base_dir) - + # Load benchmark configuration eval_bundle_dir = os.path.join(benchmark_base_dir, "eval_bundle") config_path = os.path.join(eval_bundle_dir, "core.yaml") self.eval_meta_data_path = os.path.join(eval_bundle_dir, "eval_meta_data.csv") self.data_base_path = os.path.join(eval_bundle_dir, "eval_data") - + with open(config_path, "r") as f: config = yaml.safe_load(f) - + self.tasks = config["icl_tasks"] self.eval_metadata = pd.read_csv(self.eval_meta_data_path) - + def evaluate(self) -> dict[str, Any]: """ Evaluate the model on all CORE tasks. - + Returns: Dictionary containing: - 'results': per-task accuracies @@ -84,59 +85,59 @@ def evaluate(self) -> dict[str, Any]: if self.model is None: raise RuntimeError("Trainer has no model - cannot run benchmark") - + if self.tokenizer is None: raise RuntimeError("Trainer has no tokenizer - cannot run benchmark") results = {} centered_results = {} - + # Set model to eval mode self.model.eval() - + with torch.no_grad(): for task in self.tasks: start_time = time.time() label = task["label"] - + task_meta = { "task_type": task["icl_task_type"], "dataset_uri": task["dataset_uri"], "num_fewshot": task["num_fewshot"][0], "continuation_delimiter": task.get("continuation_delimiter", " "), } - + logging.info( "Evaluating task: %s (%d-shot, type: %s)", label, - task_meta['num_fewshot'], - task_meta['task_type'] + task_meta["num_fewshot"], + task_meta["task_type"], ) - + # Load data for this task (matching evaluate_model.py pattern) data_path = os.path.join(self.data_base_path, task_meta["dataset_uri"]) with open(data_path, "r") as f: data = [json.loads(line.strip()) for line in f] - + # Shuffle the data for reproducibility (matching evaluate_model.py) shuffle_rng = random.Random(self.random_seed) shuffle_rng.shuffle(data) - + # Crop data if max_per_task is specified if self.max_per_task > 0: - data = data[:self.max_per_task] - + data = data[: self.max_per_task] + # Run evaluation using existing core_eval logic accuracy = core.evaluate_task( - self.model, # Model in CUDA memory from trainer + self.model, # Model in CUDA memory from trainer self.tokenizer, # Tokenizer from trainer data, self.device, - task_meta + task_meta, ) - + results[label] = accuracy - + # Compute centered result (normalized by random baseline) row = self.eval_metadata[self.eval_metadata["Eval Task"] == label] random_baseline = row["Random baseline"].values[0] @@ -144,25 +145,24 @@ def evaluate(self) -> dict[str, Any]: 1.0 - 0.01 * random_baseline ) centered_results[label] = centered - + elapsed = time.time() - start_time logging.info( "accuracy: %.4f | centered: %.4f | time: %.2fs", accuracy, centered, - elapsed + elapsed, ) - + # Compute overall CORE metric core_metric = sum(centered_results.values()) / len(centered_results) - return { "results": results, "centered_results": centered_results, "core_metric": core_metric, } - + def get_formatted_result(self, evaluation_result: dict[str, 
Any]) -> str: """ Format the evaluation results for display. @@ -179,10 +179,9 @@ def get_formatted_result(self, evaluation_result: dict[str, Any]) -> str: result_lines = [f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}"] for task, acc in results.items(): centered = centered_results[task] - result_lines.append( - f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}" - ) - result_lines.append(f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n") + result_lines.append(f"{task:<35}, {acc:<10.6f}, {centered:<10.6f}") + result_lines.append( + f"{'Overall CORE Metric':<35}, {'':<10}, {core_metric:<10.6f}\n" + ) return "\n".join(result_lines) - \ No newline at end of file diff --git a/plato/benchmarks/core_helpers/core.py b/plato/benchmarks/core_helpers/core.py index 2639c9c48..d767dd49d 100644 --- a/plato/benchmarks/core_helpers/core.py +++ b/plato/benchmarks/core_helpers/core.py @@ -154,13 +154,13 @@ def forward_model(model, input_ids): """ batch_size, seq_len = input_ids.size() outputs = model(input_ids) - + # Extract logits from model output (handles both raw tensors and HuggingFace output objects) - if hasattr(outputs, 'logits'): + if hasattr(outputs, "logits"): logits = outputs.logits else: logits = outputs - + # Roll the tensor to the left by one position to get the (autoregressive) target ids target_ids = torch.roll(input_ids, shifts=-1, dims=1) # Calculate cross entropy at all positions diff --git a/plato/benchmarks/core_helpers/tokenizer.py b/plato/benchmarks/core_helpers/tokenizer.py index 7b6f0e4d5..05dc619ab 100644 --- a/plato/benchmarks/core_helpers/tokenizer.py +++ b/plato/benchmarks/core_helpers/tokenizer.py @@ -167,7 +167,7 @@ def save(self, tokenizer_dir): class UniversalHuggingFaceTokenizer: """ Universal wrapper that provides a consistent interface for any HuggingFace tokenizer. - + This wrapper automatically detects special tokens (BOS, PAD, EOS) and provides utility methods that work across different tokenizer implementations. """ @@ -175,7 +175,7 @@ class UniversalHuggingFaceTokenizer: def __init__(self, tokenizer): """ Initialize the wrapper with a HuggingFace tokenizer. - + Args: tokenizer: A HuggingFace tokenizer instance (e.g., GPT2TokenizerFast) """ @@ -188,7 +188,7 @@ def __init__(self, tokenizer): def _detect_special_tokens(self): """ Auto-detect special token IDs from the tokenizer. - + Detection strategy (in order of priority): 1. Try direct attributes on the tokenizer (bos_token_id, pad_token_id, eos_token_id) 2. For missing tokens, use EOS as BOS/PAD for models like GPT-2 @@ -196,13 +196,22 @@ def _detect_special_tokens(self): 4. 
Final fallbacks: 0 for pad, pad for bos """ # Strategy 1: Direct attributes (works for most HuggingFace tokenizers) - if hasattr(self.tokenizer, 'bos_token_id') and self.tokenizer.bos_token_id is not None: + if ( + hasattr(self.tokenizer, "bos_token_id") + and self.tokenizer.bos_token_id is not None + ): self._bos_token_id = self.tokenizer.bos_token_id - - if hasattr(self.tokenizer, 'pad_token_id') and self.tokenizer.pad_token_id is not None: + + if ( + hasattr(self.tokenizer, "pad_token_id") + and self.tokenizer.pad_token_id is not None + ): self._pad_token_id = self.tokenizer.pad_token_id - - if hasattr(self.tokenizer, 'eos_token_id') and self.tokenizer.eos_token_id is not None: + + if ( + hasattr(self.tokenizer, "eos_token_id") + and self.tokenizer.eos_token_id is not None + ): self._eos_token_id = self.tokenizer.eos_token_id # For GPT-2 and similar models, BOS is often the same as EOS if self._bos_token_id is None: @@ -210,18 +219,22 @@ def _detect_special_tokens(self): # Use EOS as pad if no pad token exists if self._pad_token_id is None: self._pad_token_id = self._eos_token_id - + # Strategy 2: Try token_to_id method for tokenizers with nested structure if hasattr(self.tokenizer, "tokenizer"): tokenizer_obj = self.tokenizer.tokenizer if self._pad_token_id is None: pad_candidates = ["", "[PAD]", "<|pad|>", "", "<|endoftext|>"] - self._pad_token_id = self._try_token_candidates(tokenizer_obj, pad_candidates) + self._pad_token_id = self._try_token_candidates( + tokenizer_obj, pad_candidates + ) if self._bos_token_id is None: bos_candidates = ["", "[CLS]", "<|startoftext|>", "<|endoftext|>"] - self._bos_token_id = self._try_token_candidates(tokenizer_obj, bos_candidates) + self._bos_token_id = self._try_token_candidates( + tokenizer_obj, bos_candidates + ) # Strategy 3: Final fallbacks if self._pad_token_id is None: @@ -233,17 +246,17 @@ def _detect_special_tokens(self): def _try_token_candidates(self, tokenizer_obj, candidates): """ Try to find a token ID from a list of candidate token strings. - + Args: tokenizer_obj: The tokenizer object with token_to_id method candidates: List of token strings to try - + Returns: Token ID if found, None otherwise """ if not hasattr(tokenizer_obj, "token_to_id"): return None - + for candidate in candidates: token_id = tokenizer_obj.token_to_id(candidate) if token_id is not None: @@ -257,7 +270,7 @@ def get_bos_token_id(self): def get_pad_token_id(self): """Get the padding token ID.""" return self._pad_token_id - + def get_eos_token_id(self): """Get the end-of-sequence token ID.""" return self._eos_token_id @@ -265,11 +278,11 @@ def get_eos_token_id(self): def __call__(self, prompts, prepend=None): """ Tokenize prompts with optional prepended token. - + Args: prompts: Single string or list of strings to tokenize prepend: Optional token ID to prepend to each sequence - + Returns: List of token IDs, or list of lists if multiple prompts """ diff --git a/plato/benchmarks/registry.py b/plato/benchmarks/registry.py index 3cfb03252..1325e8bff 100644 --- a/plato/benchmarks/registry.py +++ b/plato/benchmarks/registry.py @@ -3,6 +3,7 @@ Enables runtime benchmark selection via configuration. 
""" + from plato.benchmarks import core from plato.benchmarks.base import Benchmark as BenchmarkBase @@ -10,6 +11,7 @@ "core": core.Benchmark, } + def get(type: str) -> BenchmarkBase: """Get an instance of the benchmark.""" if type in registered_benchmarks: @@ -18,10 +20,9 @@ def get(type: str) -> BenchmarkBase: else: available = list(registered_benchmarks.keys()) raise ValueError( - f"No such benchmark: {type}. " - f"Available benchmarks: {available}" + f"No such benchmark: {type}. Available benchmarks: {available}" ) - + return registered_benchmark diff --git a/plato/config.py b/plato/config.py index a3c73344e..9d0f68b15 100644 --- a/plato/config.py +++ b/plato/config.py @@ -346,7 +346,7 @@ def __new__(cls): # User specific benchmark if hasattr(config, "benchmark"): Config.benchmark = config.benchmark - + # Directory of benchmark dataset if hasattr(Config().benchmark, "data_path"): Config.params["benchmark_path"] = os.path.join( @@ -416,7 +416,7 @@ def __new__(cls): if hasattr(config, "parameters"): Config.parameters = config.parameters - + # Benchmark configuration (for model evaluation) if hasattr(config, "benchmark"): Config.benchmark = config.benchmark diff --git a/plato/servers/fedavg.py b/plato/servers/fedavg.py index b95fe76b8..30b262b5a 100644 --- a/plato/servers/fedavg.py +++ b/plato/servers/fedavg.py @@ -256,15 +256,23 @@ async def _process_reports(self): self.accuracy = trainer.test(self.testset, self.testset_sampler) # Evaluating the global model on the specified benchmark - if hasattr(Config().config, "benchmark") and hasattr(Config().benchmark, "type"): + if hasattr(Config().config, "benchmark") and hasattr( + Config().benchmark, "type" + ): benchmark_type = Config().benchmark.type if self.benchmark is None: self.benchmark = benchmarks_registry.get(benchmark_type) - logging.info("[%s] Started model evaluation on benchmark %s.", self, benchmark_type) + logging.info( + "[%s] Started model evaluation on benchmark %s.", self, benchmark_type + ) trainer = self.require_trainer() self.benchmark_result = trainer.eval(self.benchmark, self.testset_sampler) - logging.info("[%s] Model evaluation result on benchmark %s:\n%s.", self, benchmark_type, self.benchmark.get_formatted_result(self.benchmark_result)) - + logging.info( + "[%s] Model evaluation result on benchmark %s:\n%s.", + self, + benchmark_type, + self.benchmark.get_formatted_result(self.benchmark_result), + ) if hasattr(Config().trainer, "target_perplexity"): logging.info( diff --git a/plato/trainers/split_learning.py b/plato/trainers/split_learning.py index ac09322ee..dcacb08da 100644 --- a/plato/trainers/split_learning.py +++ b/plato/trainers/split_learning.py @@ -215,7 +215,10 @@ def test_model(self, model, config, testset, sampler, context): return accuracy def eval_model(self, model, config, benchmark, sampler, context): - raise NotImplementedError("eval_model is not implemented yet for SplitLearningTestingStrategy.") + raise NotImplementedError( + "eval_model is not implemented yet for SplitLearningTestingStrategy." + ) + # pylint:disable=too-many-instance-attributes class Trainer(ComposableTrainer): diff --git a/plato/trainers/strategies/base.py b/plato/trainers/strategies/base.py index a119ab20b..30032f252 100644 --- a/plato/trainers/strategies/base.py +++ b/plato/trainers/strategies/base.py @@ -586,4 +586,4 @@ def eval_model( setting eval mode, and computing the benchmark metrics. The specific return format depends on the benchmark type. 
""" - pass \ No newline at end of file + pass diff --git a/plato/trainers/strategies/testing.py b/plato/trainers/strategies/testing.py index dd76b9ff4..b170f35a0 100644 --- a/plato/trainers/strategies/testing.py +++ b/plato/trainers/strategies/testing.py @@ -99,14 +99,7 @@ def test_model(self, model, config, testset, sampler, context): return accuracy - def eval_model( - self, - model, - config, - benchmark, - sampler, - context - ) -> dict[str, Any]: + def eval_model(self, model, config, benchmark, sampler, context) -> dict[str, Any]: """ Evaluate the model on benchmark and return results. @@ -128,4 +121,4 @@ def eval_model( raise NotImplementedError( "DefaultTestingStrategy does not support benchmark evaluation. " "Please implement a custom TestingStrategy with eval_model() for your use case." - ) \ No newline at end of file + )