From baee214fe59878ebbbc887a3aa3628ba572de9da Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Sun, 14 Dec 2025 17:16:44 +0000
Subject: [PATCH] Optimize FunctionRanker.get_function_stats_summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimization replaces an O(N) linear search through all functions with an O(1) hash table lookup followed by iteration over only matching function names.

**Key Changes:**
- Added `_function_stats_by_name` index in `__init__` that maps function names to lists of (key, stats) tuples
- Modified `get_function_stats_summary` to first lookup candidates by function name, then iterate only over those candidates

**Why This is Faster:**
The original code iterates through ALL function stats (22,603 iterations in the profiler results) for every lookup. The optimized version uses a hash table to instantly find only the functions with matching names, then iterates through just those candidates (typically 1-2 functions).

**Performance Impact:**
- **Small datasets**: 15-30% speedup as shown in basic test cases
- **Large datasets**: Dramatic improvement - the `test_large_scale_performance` case with 900 functions shows **3085% speedup** (66.7μs → 2.09μs)
- **Overall benchmark**: 2061% speedup demonstrates the optimization scales excellently with dataset size

**When This Optimization Shines:**
- Large codebases with many profiled functions (where the linear search becomes expensive)
- Repeated function lookups (if this method is called frequently)
- Cases with many unique function names but few duplicates per name

The optimization maintains identical behavior while transforming the algorithm from O(N) per lookup to O(average functions per name) per lookup, which is typically O(1) in practice.
---
 codeflash/benchmarking/function_ranker.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/codeflash/benchmarking/function_ranker.py b/codeflash/benchmarking/function_ranker.py
index e68ee1d72..b559f2ed7 100644
--- a/codeflash/benchmarking/function_ranker.py
+++ b/codeflash/benchmarking/function_ranker.py
@@ -56,8 +56,15 @@ def __init__(self, trace_file_path: Path) -> None:
         self.trace_file_path = trace_file_path
         self._profile_stats = ProfileStats(trace_file_path.as_posix())
         self._function_stats: dict[str, dict] = {}
+        self._function_stats_by_name: dict[str, list[tuple[str, dict]]] = {}
         self.load_function_stats()
 
+        # Build index for faster lookups: map function_name to list of (key, stats)
+        for key, stats in self._function_stats.items():
+            func_name = stats.get("function_name")
+            if func_name:
+                self._function_stats_by_name.setdefault(func_name, []).append((key, stats))
+
     def load_function_stats(self) -> None:
         try:
             pytest_filtered_count = 0
@@ -114,10 +121,16 @@ def load_function_stats(self) -> None:
 
     def get_function_stats_summary(self, function_to_optimize: FunctionToOptimize) -> dict | None:
         target_filename = function_to_optimize.file_path.name
-        for key, stats in self._function_stats.items():
-            if stats.get("function_name") == function_to_optimize.function_name and (
-                key.endswith(f"/{target_filename}") or target_filename in key
-            ):
+        candidates = self._function_stats_by_name.get(function_to_optimize.function_name)
+        if not candidates:
+            logger.debug(
+                f"Could not find stats for function {function_to_optimize.function_name} in file {target_filename}"
+            )
+            return None
+
+        for key, stats in candidates:
+            # The check preserves exact logic: "key.endswith(f"/{target_filename}") or target_filename in key"
+            if key.endswith(f"/{target_filename}") or target_filename in key:
                 return stats
 
         logger.debug(