From a886e3f9bd37af0e0702e8b822352732b88c23d0 Mon Sep 17 00:00:00 2001
From: androna-xm <androna.xm@gmail.com>
Date: Sat, 6 Dec 2025 19:21:26 +0200
Subject: [PATCH 1/4] Expose sample parameter in get_structured_schema and
 get_schema

---
 src/neo4j_graphrag/schema.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/neo4j_graphrag/schema.py b/src/neo4j_graphrag/schema.py
index 40292067f..5f299e114 100644
--- a/src/neo4j_graphrag/schema.py
+++ b/src/neo4j_graphrag/schema.py
@@ -29,7 +29,7 @@
 DISTINCT_VALUE_LIMIT = 10
 
 NODE_PROPERTIES_QUERY = (
-    "CALL apoc.meta.data() "
+    "CALL apoc.meta.data({sample: $SAMPLE}) "
     "YIELD label, other, elementType, type, property "
     "WHERE NOT type = 'RELATIONSHIP' AND elementType = 'node' "
     "AND NOT label IN $EXCLUDED_LABELS "
@@ -38,7 +38,7 @@
 )
 
 REL_PROPERTIES_QUERY = (
-    "CALL apoc.meta.data() "
+    "CALL apoc.meta.data({sample: $SAMPLE}) "
     "YIELD label, other, elementType, type, property "
     "WHERE NOT type = 'RELATIONSHIP' AND elementType = 'relationship' "
     "AND NOT label in $EXCLUDED_LABELS "
@@ -47,7 +47,7 @@
 )
 
 REL_QUERY = (
-    "CALL apoc.meta.data() "
+    "CALL apoc.meta.data({sample: $SAMPLE}) "
     "YIELD label, other, elementType, type, property "
     "WHERE type = 'RELATIONSHIP' AND elementType = 'node' "
     "UNWIND other AS other_node "
@@ -186,6 +186,7 @@ def get_schema(
     database: Optional[str] = None,
     timeout: Optional[float] = None,
     sanitize: bool = False,
+    sample: int = 1000,
 ) -> str:
     """
     Returns the schema of the graph as a string with following format:
@@ -210,6 +211,8 @@ def get_schema(
         sanitize (bool): A flag to indicate whether to remove lists with
                 more than 128 elements from results. Useful for removing
                 embedding-like properties from database responses. Default is False.
+        sample (int): Number of nodes to sample for the apoc.meta.data procedure.
+                Setting sample to -1 will remove sampling. Defaults to 1000.
 
 
     Returns:
@@ -221,6 +224,7 @@ def get_schema(
         database=database,
         timeout=timeout,
         sanitize=sanitize,
+        sample=sample,
     )
     return format_schema(structured_schema, is_enhanced)
 
@@ -231,6 +235,7 @@ def get_structured_schema(
     database: Optional[str] = None,
     timeout: Optional[float] = None,
     sanitize: bool = False,
+    sample: int = 1000,
 ) -> dict[str, Any]:
     """
     Returns the structured schema of the graph.
@@ -280,6 +285,8 @@ def get_structured_schema(
         sanitize (bool): A flag to indicate whether to remove lists with
             more than 128 elements from results. Useful for removing
             embedding-like properties from database responses. Default is False.
+        sample (int): Number of nodes to sample for the apoc.meta.data procedure.
+            Setting sample to -1 will remove sampling. Defaults to 1000.
 
     Returns:
         dict[str, Any]: the graph schema information in a structured format.
@@ -291,7 +298,8 @@ def get_structured_schema(
             query=NODE_PROPERTIES_QUERY,
             params={
                 "EXCLUDED_LABELS": EXCLUDED_LABELS
-                + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL]
+                + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL],
+                "SAMPLE": sample,
             },
             database=database,
             timeout=timeout,
@@ -304,7 +312,7 @@ def get_structured_schema(
         for data in query_database(
             driver=driver,
             query=REL_PROPERTIES_QUERY,
-            params={"EXCLUDED_LABELS": EXCLUDED_RELS},
+            params={"EXCLUDED_LABELS": EXCLUDED_RELS, "SAMPLE": sample},
             database=database,
             timeout=timeout,
             sanitize=sanitize,
@@ -318,7 +326,8 @@ def get_structured_schema(
             query=REL_QUERY,
             params={
                 "EXCLUDED_LABELS": EXCLUDED_LABELS
-                + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL]
+                + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL],
+                "SAMPLE": sample,
             },
             database=database,
             timeout=timeout,

From 1a0594cf0690bc8590c41272233b824e3cf456f2 Mon Sep 17 00:00:00 2001
From: androna-xm <androna.xm@gmail.com>
Date: Sat, 6 Dec 2025 19:34:46 +0200
Subject: [PATCH 2/4] Update changelog for schema sampling parameter

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d28e55a87..e3e5d8e6b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## Next
 
+### Added
+- Exposed optional `sample` parameter on `get_schema` and `get_structured_schema` to control APOC sampling for schema discovery.
+
 ## 1.10.1
 
 ### Added

From 1bd14b046a8e578b9182e1e82aebb87599711997 Mon Sep 17 00:00:00 2001
From: androna-xm <androna.xm@gmail.com>
Date: Tue, 9 Dec 2025 14:53:51 +0200
Subject: [PATCH 3/4] Update unit and e2e tests for the sample parameter

---
 tests/e2e/test_schema_e2e.py         | 12 +++++++++---
 tests/e2e/test_schema_filters_e2e.py | 11 ++++++++---
 tests/unit/test_schema.py            | 14 ++++++++------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tests/e2e/test_schema_e2e.py b/tests/e2e/test_schema_e2e.py
index 5226e9855..e362ed433 100644
--- a/tests/e2e/test_schema_e2e.py
+++ b/tests/e2e/test_schema_e2e.py
@@ -29,7 +29,9 @@
 @pytest.mark.usefixtures("setup_neo4j_for_schema_query")
 def test_cypher_returns_correct_node_properties(driver: Driver) -> None:
     node_properties = query_database(
-        driver, NODE_PROPERTIES_QUERY, params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL]}
+        driver,
+        NODE_PROPERTIES_QUERY,
+        params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL], "SAMPLE": 1000},
     )
 
     expected_node_properties = [
@@ -47,7 +49,9 @@ def test_cypher_returns_correct_node_properties(driver: Driver) -> None:
 @pytest.mark.usefixtures("setup_neo4j_for_schema_query")
 def test_cypher_returns_correct_relationship_properties(driver: Driver) -> None:
     relationships_properties = query_database(
-        driver, REL_PROPERTIES_QUERY, params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL]}
+        driver,
+        REL_PROPERTIES_QUERY,
+        params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL], "SAMPLE": 1000},
     )
 
     expected_relationships_properties = [
@@ -65,7 +69,9 @@ def test_cypher_returns_correct_relationship_properties(driver: Driver) -> None:
 @pytest.mark.usefixtures("setup_neo4j_for_schema_query")
 def test_cypher_returns_correct_relationships(driver: Driver) -> None:
     relationships = query_database(
-        driver, REL_QUERY, params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL]}
+        driver,
+        REL_QUERY,
+        params={"EXCLUDED_LABELS": [BASE_ENTITY_LABEL], "SAMPLE": 1000},
     )
 
     expected_relationships = [
diff --git a/tests/e2e/test_schema_filters_e2e.py b/tests/e2e/test_schema_filters_e2e.py
index 5e79cdca3..5d11d453e 100644
--- a/tests/e2e/test_schema_filters_e2e.py
+++ b/tests/e2e/test_schema_filters_e2e.py
@@ -33,7 +33,7 @@ def test_filtering_labels_node_properties(driver: Driver) -> None:
         for data in query_database(
             driver,
             NODE_PROPERTIES_QUERY,
-            params={"EXCLUDED_LABELS": EXCLUDED_LABELS},
+            params={"EXCLUDED_LABELS": EXCLUDED_LABELS, "SAMPLE": 1000},
         )
     ]
 
@@ -45,7 +45,9 @@ def test_filtering_labels_relationship_properties(driver: Driver) -> None:
     relationship_properties = [
         data["output"]
         for data in query_database(
-            driver, REL_PROPERTIES_QUERY, params={"EXCLUDED_LABELS": EXCLUDED_RELS}
+            driver,
+            REL_PROPERTIES_QUERY,
+            params={"EXCLUDED_LABELS": EXCLUDED_RELS, "SAMPLE": 1000},
         )
     ]
 
@@ -59,7 +61,10 @@ def test_filtering_labels_relationships(driver: Driver) -> None:
         for data in query_database(
             driver,
             REL_QUERY,
-            params={"EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL]},
+            params={
+                "EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL],
+                "SAMPLE": 1000,
+            },
         )
     ]
 
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 148be55df..4842b2f66 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -97,7 +97,8 @@ def test_get_structured_schema_happy_path(driver: MagicMock) -> None:
     assert query_obj.timeout is None
     assert kwargs["database_"] is None
     assert kwargs["parameters_"] == {
-        "EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL]
+        "EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL],
+        "SAMPLE": 1000,
     }
 
     args, kwargs = calls[1]
@@ -106,7 +107,7 @@ def test_get_structured_schema_happy_path(driver: MagicMock) -> None:
     assert query_obj.text == REL_PROPERTIES_QUERY
     assert query_obj.timeout is None
     assert kwargs["database_"] is None
-    assert kwargs["parameters_"] == {"EXCLUDED_LABELS": EXCLUDED_RELS}
+    assert kwargs["parameters_"] == {"EXCLUDED_LABELS": EXCLUDED_RELS, "SAMPLE": 1000}
 
     args, kwargs = calls[2]
     query_obj = args[0]
@@ -115,7 +116,8 @@ def test_get_structured_schema_happy_path(driver: MagicMock) -> None:
     assert query_obj.timeout is None
     assert kwargs["database_"] is None
     assert kwargs["parameters_"] == {
-        "EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL]
+        "EXCLUDED_LABELS": EXCLUDED_LABELS + [BASE_ENTITY_LABEL, BASE_KG_BUILDER_LABEL],
+        "SAMPLE": 1000,
     }
 
     args, kwargs = calls[3]
@@ -205,9 +207,9 @@ def test__value_sanitize(
     description: str, input_value: Dict[str, Any], expected_output: Any
 ) -> None:
     """Test the _value_sanitize function."""
-    assert (
-        _value_sanitize(input_value) == expected_output
-    ), f"Failed test case: {description}"
+    assert _value_sanitize(input_value) == expected_output, (
+        f"Failed test case: {description}"
+    )
 
 
 @pytest.mark.parametrize(

From 32e91a078b8e1d2278125080a3a25ef70fba8db4 Mon Sep 17 00:00:00 2001
From: androna-xm <androna.xm@gmail.com>
Date: Tue, 9 Dec 2025 15:55:50 +0200
Subject: [PATCH 4/4] Apply ruff formatting to test_schema

---
 tests/unit/test_schema.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 4842b2f66..656b2d7ba 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -207,9 +207,9 @@ def test__value_sanitize(
     description: str, input_value: Dict[str, Any], expected_output: Any
 ) -> None:
     """Test the _value_sanitize function."""
-    assert _value_sanitize(input_value) == expected_output, (
-        f"Failed test case: {description}"
-    )
+    assert (
+        _value_sanitize(input_value) == expected_output
+    ), f"Failed test case: {description}"
 
 
 @pytest.mark.parametrize(