From e29df9b48b65d2beeca41de54cdd9fc5dbef7f7e Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 4 Dec 2025 19:23:24 +0000
Subject: [PATCH 1/3] Add relevant shapes to microbenchmarks

---
 .../microbenchmark_quantization_config.yml |  8 +--
 .../microbenchmarks/benchmark_runner.py    | 49 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index 4483112fa1..d3e57647d4 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -10,9 +10,11 @@ output_dir: "benchmarks/microbenchmarks/results"
 model_params:
   - name: "small_bf16_linear"
     matrix_shapes:
-      - name: "small_sweep"
-        min_power: 10
-        max_power: 15
+      - name: "llama4"
+      - name: "deepseek_v3_236b"
+      - name: "deepseek_v3_671b"
+      - name: "qwen3_32b"
+      - name: "gemma3_27b"
     high_precision_dtype: "torch.bfloat16"
     torch_compile_mode: "max-autotune"
     device: "cuda"
diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py
index 45a0534ee0..51f5d8182f 100644
--- a/benchmarks/microbenchmarks/benchmark_runner.py
+++ b/benchmarks/microbenchmarks/benchmark_runner.py
@@ -60,6 +60,53 @@ def get_shapes_for_config(
                 "ffn.w2": (M, 3584, 8192),
             }
             shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()])
+        elif name == "llama4":
+            # Llama 4 shapes
+            llama4_shapes = [
+                ("FFN", (16384, 8192, 5120)),
+                ("QO_proj", (16384, 8192, 8192)),
+                ("KV_proj", (16384, 8192, 1024)),
+                ("FFN", (128000, 8192, 5120)),
+                ("QO_proj", (128000, 8192, 8192)),
+                ("KV_proj", (128000, 8192, 1024)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in llama4_shapes])
+        elif name == "deepseek_v3_236b":
+            # DeepSeek V3 236B shapes
+            deepseek_v3_236b_shapes = [
+                ("FFN", (16384, 1536, 5120)),
+                ("QKVO_proj", (16384, 7168, 7168)),
+                ("FFN", (128000, 1536, 5120)),
+                ("QKVO_proj", (128000, 7168, 7168)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in deepseek_v3_236b_shapes])
+        elif name == "deepseek_v3_671b":
+            # DeepSeek V3 671B shapes
+            deepseek_v3_671b_shapes = [
+                ("FFN", (16384, 2048, 7168)),
+                ("QKVO_proj", (16384, 7168, 7168)),
+                ("FFN", (128000, 2048, 7168)),
+                ("QKVO_proj", (128000, 7168, 7168)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in deepseek_v3_671b_shapes])
+        elif name == "qwen3_32b":
+            # Qwen3 32B shapes
+            qwen3_32b_shapes = [
+                ("QO_proj", (16384, 5120, 5120)),
+                ("KV_proj", (16384, 5120, 640)),
+                ("QO_proj", (128000, 5120, 5120)),
+                ("KV_proj", (128000, 5120, 640)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in qwen3_32b_shapes])
+        elif name == "gemma3_27b":
+            # Gemma 3 27B shapes
+            gemma3_27b_shapes = [
+                ("QO_proj", (16384, 4096, 4096)),
+                ("KV_proj", (16384, 4096, 1024)),
+                ("QO_proj", (128000, 4096, 4096)),
+                ("KV_proj", (128000, 4096, 1024)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in gemma3_27b_shapes])
         elif name == "pow2":
             # Generate shapes with dimensions that are powers of 2
             min_power_of_2 = shape_config.get("min_power", 10)  # 1024
@@ -105,7 +152,7 @@
                         counter += 1
         else:
             raise NotImplementedError(
-                f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep."
+                f"Shape config {name} not supported. Supported options: custom, llama, llama4, deepseek_v3_236b, deepseek_v3_671b, qwen3_32b, gemma3_27b, pow2, pow2_extended, sweep."
             )
     return shapes

From 6d6edd7d506fb40ae0bfc5a1f0ef0088201e15e3 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Tue, 9 Dec 2025 17:41:30 +0000
Subject: [PATCH 2/3] Add custom matrix shapes to microbenchmark config

---
 .../dashboard/microbenchmark_quantization_config.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index d3e57647d4..1666a1331d 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -15,6 +15,13 @@ model_params:
       - name: "deepseek_v3_671b"
       - name: "qwen3_32b"
       - name: "gemma3_27b"
+      - name: "custom"
+        shapes: [
+          [1920, 3072, 3072],
+          [1920, 3072, 9216],
+          [1920, 3072, 14336],
+          [1920, 14336, 3072]
+        ]
     high_precision_dtype: "torch.bfloat16"
     torch_compile_mode: "max-autotune"
     device: "cuda"

From 5f36d5e057d04646b43cbf7ae06e9ab9e2bb8df6 Mon Sep 17 00:00:00 2001
From: Apurva Jain
Date: Tue, 9 Dec 2025 10:23:57 -0800
Subject: [PATCH 3/3] Re-enable int8wo in benchmark configuration

---
 benchmarks/dashboard/microbenchmark_quantization_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index 1666a1331d..5d6563f25e 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -1,7 +1,7 @@
 # Benchmark configuration for microbenchmarks
 benchmark_mode: "inference"
 quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison
-  # - "int8wo" TODO: Re-enable once we debug the delay in the benchmark
+  - "int8wo"
   - "int8dq"
   - "float8dq-tensor"
   - "float8dq-row"
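
A quick way to sanity-check the presets from PATCH 1 together with the custom
entry from PATCH 2 is to expand them the same way the runner does. The sketch
below is illustrative only: it assumes get_shapes_for_config keeps the
interface the hunks imply (a list of shape-config dicts in, (shape_name, shape)
tuples out, with the "custom" branch passing its listed shapes through under
the config name), and that the benchmarks package is importable from the repo
root. Reading each triple as (M, K, N) follows the existing llama branch,
where attn.wqkv is (M, 8192, 1280).

    from benchmarks.microbenchmarks.benchmark_runner import get_shapes_for_config

    # Mirror a subset of the "matrix_shapes" list from the YAML config above.
    shape_configs = [
        {"name": "llama4"},
        {"name": "qwen3_32b"},
        {"name": "custom", "shapes": [[1920, 3072, 3072], [1920, 14336, 3072]]},
    ]

    # Each preset expands to (shape_name, shape) pairs, e.g.
    # ("llama4_FFN", (16384, 8192, 5120)); "custom" yields its shapes verbatim.
    for shape_name, shape in get_shapes_for_config(shape_configs):
        m, k, n = shape
        print(f"{shape_name}: M={m} K={k} N={n}")

Note that each preset lists its projections at two token counts, M=16384 and
M=128000, so names such as llama4_FFN appear once per M in the output.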