From e29df9b48b65d2beeca41de54cdd9fc5dbef7f7e Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 4 Dec 2025 19:23:24 +0000
Subject: [PATCH 1/3] Add relevant shapes to microbenchmarks

---
 .../microbenchmark_quantization_config.yml |  8 +--
 .../microbenchmarks/benchmark_runner.py    | 49 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index 4483112fa1..d3e57647d4 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -10,9 +10,11 @@ output_dir: "benchmarks/microbenchmarks/results"
 model_params:
   - name: "small_bf16_linear"
     matrix_shapes:
-      - name: "small_sweep"
-        min_power: 10
-        max_power: 15
+      - name: "llama4"
+      - name: "deepseek_v3_236b"
+      - name: "deepseek_v3_671b"
+      - name: "qwen3_32b"
+      - name: "gemma3_27b"
     high_precision_dtype: "torch.bfloat16"
     torch_compile_mode: "max-autotune"
     device: "cuda"
diff --git a/benchmarks/microbenchmarks/benchmark_runner.py b/benchmarks/microbenchmarks/benchmark_runner.py
index 45a0534ee0..51f5d8182f 100644
--- a/benchmarks/microbenchmarks/benchmark_runner.py
+++ b/benchmarks/microbenchmarks/benchmark_runner.py
@@ -60,6 +60,53 @@ def get_shapes_for_config(
                 "ffn.w2": (M, 3584, 8192),
             }
             shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()])
+        elif name == "llama4":
+            # Llama 4 shapes
+            llama4_shapes = [
+                ("FFN", (16384, 8192, 5120)),
+                ("QO_proj", (16384, 8192, 8192)),
+                ("KV_proj", (16384, 8192, 1024)),
+                ("FFN", (128000, 8192, 5120)),
+                ("QO_proj", (128000, 8192, 8192)),
+                ("KV_proj", (128000, 8192, 1024)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in llama4_shapes])
+        elif name == "deepseek_v3_236b":
+            # DeepSeek V3 236B shapes
+            deepseek_v3_236b_shapes = [
+                ("FFN", (16384, 1536, 5120)),
+                ("QKVO_proj", (16384, 7168, 7168)),
+                ("FFN", (128000, 1536, 5120)),
+                ("QKVO_proj", (128000, 7168, 7168)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in deepseek_v3_236b_shapes])
+        elif name == "deepseek_v3_671b":
+            # DeepSeek V3 671B shapes
+            deepseek_v3_671b_shapes = [
+                ("FFN", (16384, 2048, 7168)),
+                ("QKVO_proj", (16384, 7168, 7168)),
+                ("FFN", (128000, 2048, 7168)),
+                ("QKVO_proj", (128000, 7168, 7168)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in deepseek_v3_671b_shapes])
+        elif name == "qwen3_32b":
+            # Qwen3 32B shapes
+            qwen3_32b_shapes = [
+                ("QO_proj", (16384, 5120, 5120)),
+                ("KV_proj", (16384, 5120, 640)),
+                ("QO_proj", (128000, 5120, 5120)),
+                ("KV_proj", (128000, 5120, 640)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in qwen3_32b_shapes])
+        elif name == "gemma3_27b":
+            # Gemma 3 27B shapes
+            gemma3_27b_shapes = [
+                ("QO_proj", (16384, 4096, 4096)),
+                ("KV_proj", (16384, 4096, 1024)),
+                ("QO_proj", (128000, 4096, 4096)),
+                ("KV_proj", (128000, 4096, 1024)),
+            ]
+            shapes.extend([(f"{name}_{k}", v) for k, v in gemma3_27b_shapes])
         elif name == "pow2":
             # Generate shapes with dimensions that are powers of 2
             min_power_of_2 = shape_config.get("min_power", 10)  # 1024
@@ -105,7 +152,7 @@
                         counter += 1
         else:
             raise NotImplementedError(
-                f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep."
+                f"Shape config {name} not supported. Supported options: custom, llama, llama4, deepseek_v3_236b, deepseek_v3_671b, qwen3_32b, gemma3_27b, pow2, pow2_extended, sweep."
             )
     return shapes

From 6d6edd7d506fb40ae0bfc5a1f0ef0088201e15e3 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Tue, 9 Dec 2025 17:41:30 +0000
Subject: [PATCH 2/3] Add custom matrix shapes to microbenchmark config

---
 .../dashboard/microbenchmark_quantization_config.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index d3e57647d4..1666a1331d 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -15,6 +15,13 @@ model_params:
       - name: "deepseek_v3_671b"
       - name: "qwen3_32b"
       - name: "gemma3_27b"
+      - name: "custom"
+        shapes: [
+          [1920, 3072, 3072],
+          [1920, 3072, 9216],
+          [1920, 3072, 14336],
+          [1920, 14336, 3072]
+        ]
     high_precision_dtype: "torch.bfloat16"
     torch_compile_mode: "max-autotune"
     device: "cuda"

From 5f36d5e057d04646b43cbf7ae06e9ab9e2bb8df6 Mon Sep 17 00:00:00 2001
From: Apurva Jain
Date: Tue, 9 Dec 2025 10:23:57 -0800
Subject: [PATCH 3/3] Re-enable int8wo in benchmark configuration

---
 benchmarks/dashboard/microbenchmark_quantization_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dashboard/microbenchmark_quantization_config.yml b/benchmarks/dashboard/microbenchmark_quantization_config.yml
index 1666a1331d..5d6563f25e 100644
--- a/benchmarks/dashboard/microbenchmark_quantization_config.yml
+++ b/benchmarks/dashboard/microbenchmark_quantization_config.yml
@@ -1,7 +1,7 @@
 # Benchmark configuration for microbenchmarks
 benchmark_mode: "inference"
 quantization_config_recipe_names: # Will run a baseline inference for model by default, without quantization for comparison
-  # - "int8wo" TODO: Re-enable once we debug the delay in the benchmark
+  - "int8wo"
   - "int8dq"
   - "float8dq-tensor"
   - "float8dq-row"
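
A quick way to sanity-check the presets from PATCH 1 together with the custom
entry from PATCH 2 is to expand them the same way the runner does. The sketch
below is illustrative only: it assumes get_shapes_for_config keeps the
interface the hunks imply (a list of shape-config dicts in, (shape_name, shape)
tuples out, with the "custom" branch passing its listed shapes through under
the config name), and that the benchmarks package is importable from the repo
root. Reading each triple as (M, K, N) follows the existing llama branch,
where attn.wqkv is (M, 8192, 1280).

    from benchmarks.microbenchmarks.benchmark_runner import get_shapes_for_config

    # Mirror a subset of the "matrix_shapes" list from the YAML config above.
    shape_configs = [
        {"name": "llama4"},
        {"name": "qwen3_32b"},
        {"name": "custom", "shapes": [[1920, 3072, 3072], [1920, 14336, 3072]]},
    ]

    # Each preset expands to (shape_name, shape) pairs, e.g.
    # ("llama4_FFN", (16384, 8192, 5120)); "custom" yields its shapes verbatim.
    for shape_name, shape in get_shapes_for_config(shape_configs):
        m, k, n = shape
        print(f"{shape_name}: M={m} K={k} N={n}")

Note that each preset lists its projections at two token counts, M=16384 and
M=128000, so names such as llama4_FFN appear once per M in the output.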