From 3494e3538d6a054e4e8d27a2f1326c638fe171c5 Mon Sep 17 00:00:00 2001 From: Thomas Niedermayer Date: Wed, 17 Dec 2025 14:00:21 +0100 Subject: [PATCH 1/3] Fix tagpack actor validation on tag level --- CHANGELOG.md | 4 ++ Makefile | 4 +- src/graphsenselib/tagpack/cli.py | 13 +++- tests/tagpack/test_actor_validation.py | 90 ++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be8e9e8..edb8699 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [25.11.11/2.8.11] 2025-12-17 +### fixed +- Tagpack actor validation + ## [25.11.10/2.8.10] 2025-12-15 ### fixed - better retry on ingest (also for prepared statements) diff --git a/Makefile b/Makefile index cc055d3..ae4ea09 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ SHELL := /bin/bash PROJECT := graphsense-lib VENV := venv -RELEASE := 'v25.11.10' -RELEASESEM := 'v2.8.10' +RELEASE := 'v25.11.11' +RELEASESEM := 'v2.8.11' -include .env diff --git a/src/graphsenselib/tagpack/cli.py b/src/graphsenselib/tagpack/cli.py index 19850dd..bec5583 100644 --- a/src/graphsenselib/tagpack/cli.py +++ b/src/graphsenselib/tagpack/cli.py @@ -450,8 +450,17 @@ def validate_tagpack( # Check actors if enabled if check_actor_references and actorpack: - actor = tagpack.all_header_fields.get("actor") - if actor: + # Collect actors from header level and tag level + actors_to_check = set() + header_actor = tagpack.all_header_fields.get("actor") + if header_actor: + actors_to_check.add(header_actor) + for tag in tagpack.tags: + tag_actor = tag.all_fields.get("actor") + if tag_actor: + actors_to_check.add(tag_actor) + + for actor in actors_to_check: resolved = actorpack.resolve_actor(actor) if resolved is None: # Unknown actor diff --git a/tests/tagpack/test_actor_validation.py b/tests/tagpack/test_actor_validation.py index 9bc3610..a208142 100644 --- a/tests/tagpack/test_actor_validation.py +++ b/tests/tagpack/test_actor_validation.py @@ -88,3 +88,93 @@ def test_validate_with_actor_checking(): os.unlink(tagpack_path) finally: os.unlink(actorpack_path) + + +def test_validate_with_actor_on_tags_level(): + """Test actor validation when actor is defined at the tags level""" + actorpack = """title: Test +creator: Test +description: Test +lastmod: 2024-01-01 +actors: +- id: binance + aliases: ["binanceexchange"] + uri: https://binance.com + label: Binance + categories: [exchange] +""" + + tagpack_valid = """title: Test +creator: Test +source: http://example.com +currency: BTC +lastmod: 2024-01-01 +tags: +- address: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa + label: Test + actor: binance +""" + + tagpack_typo = """title: Test +creator: Test +source: http://example.com +currency: BTC +lastmod: 2024-01-01 +tags: +- address: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa + label: Test + actor: binanse +""" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(actorpack) + actorpack_path = f.name + + try: + # Test valid actor at tags level + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(tagpack_valid) + tagpack_path = f.name + + result = CliRunner().invoke( + tagpacktool_cli, + [ + "tagpack-tool", + "tagpack", + "validate", + tagpack_path, + "--check-actor-references", + "--actorpack-path", + actorpack_path, + ], + ) + + assert result.exit_code == 0 + assert "Unique actors found in actorpack: 1" in result.output + os.unlink(tagpack_path) + + # Test typo at tags level with suggestions + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(tagpack_typo) + tagpack_path = f.name + + result = CliRunner().invoke( + tagpacktool_cli, + [ + "tagpack-tool", + "tagpack", + "validate", + tagpack_path, + "--check-actor-references", + "--actorpack-path", + actorpack_path, + ], + ) + + assert result.exit_code == 0 + assert "binanse" in result.output + assert "binance" in result.output + assert "suggestions" in result.output + os.unlink(tagpack_path) + finally: + os.unlink(actorpack_path) From de7c8aa01fd0d0b400b536731665320de8ac468a Mon Sep 17 00:00:00 2001 From: Thomas Niedermayer Date: Fri, 19 Dec 2025 11:37:37 +0100 Subject: [PATCH 2/3] Faster validation --- pyproject.toml | 1 + src/graphsenselib/tagpack/__init__.py | 50 +++++- src/graphsenselib/tagpack/tagpack.py | 175 ++++++++++++-------- src/graphsenselib/tagpack/tagpack_schema.py | 43 +++-- src/graphsenselib/tagpack/taxonomy.py | 5 +- tests/tagpack/test_fast_yaml.py | 113 +++++++++++++ uv.lock | 49 ++++++ 7 files changed, 353 insertions(+), 83 deletions(-) create mode 100644 tests/tagpack/test_fast_yaml.py diff --git a/pyproject.toml b/pyproject.toml index b74e250..441dde6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -129,6 +129,7 @@ tagpacks = [ "sqlmodel>=0.0.22", "tabulate>=0.9.0", "rapidfuzz>=3.13.0", + "rapidyaml>=0.10.0", ] all = [ diff --git a/src/graphsenselib/tagpack/__init__.py b/src/graphsenselib/tagpack/__init__.py index ec8533d..acdc38d 100644 --- a/src/graphsenselib/tagpack/__init__.py +++ b/src/graphsenselib/tagpack/__init__.py @@ -7,6 +7,17 @@ except ImportError: from yaml import SafeLoader as SafeLoader +# Fast YAML loading using rapidyaml +import warnings + +with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="builtin type Swig.*has no __module__ attribute", + category=DeprecationWarning, + ) + import ryml as _ryml + if sys.version_info[:2] >= (3, 8): # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` from importlib.metadata import PackageNotFoundError, version # pragma: no cover @@ -62,9 +73,46 @@ def __str__(self): class UniqueKeyLoader(SafeLoader): def construct_mapping(self, node, deep=False): mapping = set() - for key_node, value_node in node.value: + for key_node, _ in node.value: key = self.construct_object(key_node, deep=deep) if key in mapping: raise ValidationError(f"Duplicate {key!r} key found in YAML.") mapping.add(key) return super().construct_mapping(node, deep) + + +def _check_duplicate_keys_ryml(tree, root_id): + """Check for duplicate keys at the top level of ryml tree.""" + if not tree.is_map(root_id): + return + keys = set() + for i in range(tree.num_children(root_id)): + child_id = tree.child(root_id, i) + key = tree.key(child_id) + if key in keys: + raise ValidationError( + f"Duplicate {key.tobytes().decode()!r} key found in YAML." + ) + keys.add(key) + + +def load_yaml_fast(file_path): + """Load YAML using rapidyaml (~10x faster) for large files, else PyYAML.""" + import json + import os + import yaml + + file_size = os.path.getsize(file_path) + + # Use UniqueKeyLoader for small files (duplicate key detection) + if file_size < 100 * 1024: + with open(file_path, "r") as f: + return yaml.load(f, UniqueKeyLoader) + + # Fast path: ryml -> check duplicates -> JSON -> json.loads + with open(file_path, "rb") as f: + content = f.read() + tree = _ryml.parse_in_arena(content) + _check_duplicate_keys_ryml(tree, tree.root_id()) + json_bytes = _ryml.emit_json_malloc(tree, tree.root_id()) + return json.loads(json_bytes) diff --git a/src/graphsenselib/tagpack/tagpack.py b/src/graphsenselib/tagpack/tagpack.py index 7efcc9a..0dc33b8 100644 --- a/src/graphsenselib/tagpack/tagpack.py +++ b/src/graphsenselib/tagpack/tagpack.py @@ -17,7 +17,12 @@ from git import Repo from yamlinclude import YamlIncludeConstructor -from graphsenselib.tagpack import TagPackFileError, UniqueKeyLoader, ValidationError +from graphsenselib.tagpack import ( + TagPackFileError, + UniqueKeyLoader, + ValidationError, + load_yaml_fast, +) from graphsenselib.tagpack.concept_mapping import map_concepts_to_supported_concepts from graphsenselib.tagpack.utils import apply_to_dict_field, try_parse_date from graphsenselib.utils.address import validate_address @@ -131,14 +136,14 @@ def get_uri_for_tagpack( return res, rel_path, default_prefix, commit_date -def check_for_null_characters(field_name: str, value, context: str = "") -> None: +def check_for_null_characters(field_name: str, value, context="") -> None: """ Check if a field value contains null characters (\x00 or \u0000). Args: field_name: Name of the field being checked value: Value to check for null characters - context: Additional context for error messages (e.g., tag info) + context: Additional context for error messages Raises: ValidationError: If null characters are found in the value @@ -293,14 +298,21 @@ def __init__(self, uri, contents, schema, taxonomies): verifiable_currencies = supported_base_currencies def load_from_file(uri, pathname, schema, taxonomies, header_dir=None): - YamlIncludeConstructor.add_to_loader_class( - loader_class=UniqueKeyLoader, base_dir=header_dir - ) - if not os.path.isfile(pathname): sys.exit("This program requires {} to be a file".format(pathname)) - with open(pathname, "r") as f: - contents = yaml.load(f, UniqueKeyLoader) + + # Check first 4KB for !include directives + with open(pathname, "rb") as f: + has_include = b"!include" in f.read(4096) + + if header_dir is not None or has_include: + YamlIncludeConstructor.add_to_loader_class( + loader_class=UniqueKeyLoader, base_dir=header_dir + ) + with open(pathname, "r") as f: + contents = yaml.load(f, UniqueKeyLoader) + else: + contents = load_yaml_fast(pathname) if "header" in contents.keys(): for k, v in contents["header"].items(): @@ -312,8 +324,13 @@ def update_lastmod(self): self.contents["lastmod"] = date.today() def init_default_values(self): + raw_tags = self.contents.get("tags", []) + + if self.schema is None: + return + if "confidence" not in self.contents and not all( - "confidence" in tag.contents for tag in self.tags + "confidence" in tag for tag in raw_tags ): conf_scores_df = self.schema.confidences min_confs = conf_scores_df[ @@ -334,9 +351,9 @@ def init_default_values(self): if "network" not in self.contents and "currency" in self.contents: self.contents["network"] = self.contents["currency"] - for t in self.tags: - if "network" not in t.contents and "currency" in t.contents: - t.contents["network"] = t.contents["currency"] + for raw_tag in raw_tags: + if "network" not in raw_tag and "currency" in raw_tag: + raw_tag["network"] = raw_tag["currency"] @property def all_header_fields(self): @@ -379,17 +396,23 @@ def get_unique_tags(self): keys = ("address", "currency", "network", "label", "source") seen = set() duplicates = [] - self._unique_tags = [] + unique_raw_tags = [] - for tag in self.tags: - fields = tag.all_fields - key_tuple = tuple(str(fields.get(k, "")).lower() for k in keys) + header_tag_fields = self.tag_fields + raw_tags = self.contents.get("tags", []) + + for raw_tag in raw_tags: + key_tuple = tuple( + str(raw_tag.get(k) or header_tag_fields.get(k) or "").lower() + for k in keys + ) if key_tuple in seen: duplicates.append(key_tuple) else: seen.add(key_tuple) - self._unique_tags.append(tag) + unique_raw_tags.append(raw_tag) + self._unique_tags = [Tag.from_contents(raw, self) for raw in unique_raw_tags] self._duplicates = duplicates return self._unique_tags @@ -427,54 +450,65 @@ def validate(self): self.schema.check_taxonomies(field, value, self.taxonomies) # iterate over all tags, check types, taxonomy and mandatory use - e2 = "Mandatory tag field {} missing in {}" - e3 = "Field {} not allowed in {}" - e4 = "Value of body field {} must not be empty (None) in {}" + uri = self.uri + e2 = f"Mandatory tag field {{}} missing in {{}} ({uri})" + e3 = f"Field {{}} not allowed in {{}} ({uri})" + e4 = f"Value of body field {{}} must not be empty (None) in {{}} ({uri})" ut = self.get_unique_tags() nr_no_actors = 0 + address_counts = defaultdict(int) + schema_tag_fields = self.schema.tag_fields + schema_mandatory_tag_fields = self.schema.mandatory_tag_fields + tagpack_tag_fields = self.tag_fields + for tag in ut: - # check if mandatory tag fields are defined if not isinstance(tag, Tag): - raise ValidationError("Unknown tag type {}".format(tag)) + raise ValidationError(f"Unknown tag type {tag} ({uri})") + + fields = tag.all_fields + explicit = tag.explicit_fields - actor = tag.all_fields.get("actor", None) + actor = fields.get("actor") if actor is None: nr_no_actors += 1 - address = tag.all_fields.get("address", None) - tx_hash = tag.all_fields.get("tx_hash", None) + address = fields.get("address") + tx_hash = fields.get("tx_hash") if address is None and tx_hash is None: raise ValidationError(e2.format("address", tag)) elif address is not None and tx_hash is not None: raise ValidationError( - "The fields tx_hash and address are mutually exclusive but both are set." + f"The fields tx_hash and address are mutually exclusive but both are set in {tag} ({uri})" ) - for schema_field in self.schema.mandatory_tag_fields: + if address is not None: + address_counts[address] += 1 + + for schema_field in schema_mandatory_tag_fields: if ( - schema_field not in tag.explicit_fields - and schema_field not in self.tag_fields + schema_field not in explicit + and schema_field not in tagpack_tag_fields ): raise ValidationError(e2.format(schema_field, tag)) - for field, value in tag.explicit_fields.items(): + for field, value in explicit.items(): # check whether field is defined as body field - if field not in self.schema.tag_fields: + if field not in schema_tag_fields: raise ValidationError(e3.format(field, tag)) # check for None values if value is None: raise ValidationError(e4.format(field, tag)) - check_for_null_characters(field, value, str(tag)) + check_for_null_characters(field, value, tag) - # check types and taxomomy use + # check types and taxonomy use try: self.schema.check_type(field, value) self.schema.check_taxonomies(field, value, self.taxonomies) except ValidationError as e: - raise ValidationError(f"{e} in {tag}") + raise ValidationError(f"{e} in {tag} ({uri})") if nr_no_actors > 0: logger.warning( @@ -482,12 +516,6 @@ def validate(self): "Please consider connecting the tag to an actor." ) - address_counts = defaultdict(int) - for tag in ut: - address = tag.all_fields.get("address") - if address is not None: - address_counts[address] += 1 - for address, count in address_counts.items(): if count > 100: logger.warning( @@ -663,37 +691,54 @@ def __init__(self, contents, tagpack): self.contents = contents self.tagpack = tagpack - # This allows the context in the yaml file to be written in eithe - # normal yaml syntax which is now converted to a json string - # of directly as json string. - if isinstance(self.contents.get("context", None), dict): - apply_to_dict_field(self.contents, "context", json.dumps, fail=True) - - # set default values for concepts field - # make sure abuse and category are always part of the context - concepts = self.all_fields.get("concepts", []) - category = self.all_fields.get("category", None) - abuse = self.all_fields.get("abuse", None) + header = tagpack.tag_fields + explicit = contents + + # Handle context field: extract tags before converting to JSON string + context = ( + explicit["context"] if "context" in explicit else header.get("context") + ) + context_tags = None + if isinstance(context, dict): + context_tags = context.get("tags") + # Convert dict context to JSON string (required for storage) + contents["context"] = json.dumps(context) + elif context is not None: + # Context is already a string, parse once to get tags + try: + context_tags = json.loads(context).get("tags") + except json.JSONDecodeError: + pass + + # Ensure abuse and category are always part of concepts + concepts = ( + explicit["concepts"] + if "concepts" in explicit + else header.get("concepts", []) + ) + if isinstance(concepts, list): + concepts = concepts.copy() # Don't mutate original + else: + concepts = [] + + category = ( + explicit["category"] if "category" in explicit else header.get("category") + ) + abuse = explicit["abuse"] if "abuse" in explicit else header.get("abuse") + if abuse and abuse not in concepts: concepts.append(abuse) if category and category not in concepts: concepts.append(category) - # add tags from "tags" field in concepts. - try: - ctx = self.all_fields.get("context") - if ctx is not None: - tags = json.loads(ctx).get("tags", None) - if tags is not None: - mcs = map_concepts_to_supported_concepts(tags) - for mc in mcs: - if mc not in concepts: - concepts.append(mc) - except json.decoder.JSONDecodeError: - pass + if context_tags is not None: + mcs = map_concepts_to_supported_concepts(context_tags) + for mc in mcs: + if mc not in concepts: + concepts.append(mc) - self.contents["concepts"] = concepts + contents["concepts"] = concepts # the yaml parser does not deal with string quoted dates. # so '2022-10-1' is not interpreted as a date. This line fixes this. diff --git a/src/graphsenselib/tagpack/tagpack_schema.py b/src/graphsenselib/tagpack/tagpack_schema.py index 5a11bf5..2985e1a 100644 --- a/src/graphsenselib/tagpack/tagpack_schema.py +++ b/src/graphsenselib/tagpack/tagpack_schema.py @@ -24,26 +24,36 @@ def __init__(self): self.confidences = pd.read_csv(confidence, index_col="id") self.definition = TAGPACK_SCHEMA_FILE + self._header_fields = self.schema["header"] + self._tag_fields = self.schema["tag"] + self._mandatory_header_fields = { + k: v for k, v in self._header_fields.items() if v["mandatory"] + } + self._mandatory_tag_fields = { + k: v for k, v in self._tag_fields.items() if v["mandatory"] + } + self._taxonomy_cache = {} + @property def header_fields(self): - return self.schema["header"] + return self._header_fields @property def mandatory_header_fields(self): - return {k: v for k, v in self.schema["header"].items() if v["mandatory"]} + return self._mandatory_header_fields @property def tag_fields(self): - return self.schema["tag"] + return self._tag_fields @property def mandatory_tag_fields(self): - return {k: v for k, v in self.tag_fields.items() if v["mandatory"]} + return self._mandatory_tag_fields @property def all_fields(self): """Returns all header and body fields""" - return {**self.header_fields, **self.tag_fields} + return {**self._header_fields, **self._tag_fields} def field_type(self, field): return self.all_fields[field]["type"] @@ -63,8 +73,7 @@ def check_type(self, field, value): return check_type(self.schema, field, field_def, value) def check_taxonomies(self, field, value, taxonomies): - """Checks whether a field uses values from given taxonomies, with performance improvements.""" - # Retrieve the taxonomy information once + """Checks whether a field uses values from given taxonomies, with caching.""" taxonomy = self.field_taxonomy(field) if not taxonomy: return True @@ -74,17 +83,19 @@ def check_taxonomies(self, field, value, taxonomies): if isinstance(taxonomy, str): taxonomy = [taxonomy] - expected_taxonomies = [taxonomies.get(tid) for tid in taxonomy] - if None in expected_taxonomies: - raise ValidationError(f"Unknown taxonomy {taxonomy}") - - valid_concepts = set() - for t in expected_taxonomies: - valid_concepts.update(t.concept_ids) + # Cache valid_concepts per field (taxonomies are stable during validation) + cache_key = (field, tuple(taxonomy)) + valid_concepts = self._taxonomy_cache.get(cache_key) + if valid_concepts is None: + expected_taxonomies = [taxonomies.get(tid) for tid in taxonomy] + if None in expected_taxonomies: + raise ValidationError(f"Unknown taxonomy {taxonomy}") + valid_concepts = set() + for t in expected_taxonomies: + valid_concepts.update(t.concept_ids) + self._taxonomy_cache[cache_key] = valid_concepts values = value if isinstance(value, list) else [value] - - # Check each provided value against the union of valid concept IDs for v in values: if v not in valid_concepts: raise ValidationError(f"Undefined concept {v} for {field} field") diff --git a/src/graphsenselib/tagpack/taxonomy.py b/src/graphsenselib/tagpack/taxonomy.py index 4e50a49..44aa42a 100644 --- a/src/graphsenselib/tagpack/taxonomy.py +++ b/src/graphsenselib/tagpack/taxonomy.py @@ -95,6 +95,7 @@ def __init__(self, key, uri): self.key = key self.uri = uri self.concepts = [] + self._concept_ids_cache = None def load_from_remote(self): response = requests.get(self.uri) @@ -147,7 +148,9 @@ def load_from_local(self): @property def concept_ids(self): - return [concept.id for concept in self.concepts] + if self._concept_ids_cache is None: + self._concept_ids_cache = frozenset(concept.id for concept in self.concepts) + return self._concept_ids_cache def add_concept(self, concept_id, label, level, description): concept_uri = self.uri + "/" + concept_id diff --git a/tests/tagpack/test_fast_yaml.py b/tests/tagpack/test_fast_yaml.py new file mode 100644 index 0000000..8a43973 --- /dev/null +++ b/tests/tagpack/test_fast_yaml.py @@ -0,0 +1,113 @@ +"""Tests for fast YAML loading functionality.""" + +import os + +import pytest +import yaml + +from graphsenselib.tagpack import ValidationError, load_yaml_fast + + +class TestFastYamlLoading: + """Tests for the fast YAML loading functionality.""" + + def test_load_yaml_fast_small_file(self, tmp_path): + content = {"title": "Test", "tags": [{"label": "a", "address": "b"}]} + yaml_file = tmp_path / "test.yaml" + yaml_file.write_text(yaml.dump(content)) + result = load_yaml_fast(str(yaml_file)) + assert result == content + + def test_load_yaml_fast_preserves_data_types(self, tmp_path): + content = { + "string": "hello", + "integer": 42, + "float": 3.14, + "boolean_true": True, + "boolean_false": False, + "null_value": None, + "list": [1, 2, 3], + "nested": {"a": "b", "c": 1}, + } + yaml_file = tmp_path / "test.yaml" + yaml_file.write_text(yaml.dump(content)) + result = load_yaml_fast(str(yaml_file)) + + assert result["string"] == "hello" + assert result["integer"] == 42 + assert isinstance(result["float"], float) + assert result["boolean_true"] is True + assert result["boolean_false"] is False + assert result["null_value"] is None + assert result["list"] == [1, 2, 3] + assert result["nested"] == {"a": "b", "c": 1} + + +class TestFastPathPerformance: + """Tests that exercise the fast path for large files.""" + + def test_large_file_uses_fast_path(self, tmp_path): + tags = [{"label": f"label_{i}", "address": f"addr_{i}"} for i in range(5000)] + content = {"title": "Large TagPack", "creator": "Test", "tags": tags} + yaml_file = tmp_path / "large.yaml" + yaml_file.write_text(yaml.dump(content)) + + file_size = os.path.getsize(yaml_file) + assert file_size > 100 * 1024, f"File too small: {file_size} bytes" + + result = load_yaml_fast(str(yaml_file)) + assert result["title"] == "Large TagPack" + assert len(result["tags"]) == 5000 + + def test_fast_path_matches_standard_loader(self, tmp_path): + tags = [{"label": f"label_{i}", "address": f"addr_{i}"} for i in range(5000)] + content = {"title": "Test", "is_cluster_definer": True, "tags": tags} + yaml_file = tmp_path / "large.yaml" + yaml_file.write_text(yaml.dump(content)) + + standard_result = yaml.safe_load(yaml_file.read_text()) + fast_result = load_yaml_fast(str(yaml_file)) + + assert fast_result["title"] == standard_result["title"] + assert ( + fast_result["is_cluster_definer"] == standard_result["is_cluster_definer"] + ) + assert len(fast_result["tags"]) == len(standard_result["tags"]) + + +class TestDuplicateKeyDetection: + """Tests for duplicate YAML key detection.""" + + def test_duplicate_key_small_file_raises(self, tmp_path): + """Duplicate keys in small files should raise ValidationError.""" + yaml_file = tmp_path / "dup.yaml" + yaml_file.write_text("title: First\ntitle: Duplicate\n") + + with pytest.raises(ValidationError) as exc_info: + load_yaml_fast(str(yaml_file)) + assert "Duplicate" in str(exc_info.value) + assert "title" in str(exc_info.value) + + def test_duplicate_key_large_file_raises(self, tmp_path): + """Duplicate keys in large files (fast path) should raise ValidationError.""" + padding = "".join([f"key_{i}: value_{i}\n" for i in range(5000)]) + yaml_file = tmp_path / "large_dup.yaml" + yaml_file.write_text(f"title: First\n{padding}title: Duplicate\n") + + file_size = os.path.getsize(yaml_file) + assert file_size > 100 * 1024, f"File too small: {file_size} bytes" + + with pytest.raises(ValidationError) as exc_info: + load_yaml_fast(str(yaml_file)) + assert "Duplicate" in str(exc_info.value) + assert "title" in str(exc_info.value) + + def test_duplicate_key_in_nested_map_raises(self, tmp_path): + """Duplicate keys in nested maps should raise ValidationError.""" + yaml_file = tmp_path / "nested_dup.yaml" + yaml_file.write_text("outer:\n inner: 1\n inner: 2\n") + + with pytest.raises(ValidationError) as exc_info: + load_yaml_fast(str(yaml_file)) + assert "Duplicate" in str(exc_info.value) + assert "inner" in str(exc_info.value) diff --git a/uv.lock b/uv.lock index ba20a2a..a16263f 100644 --- a/uv.lock +++ b/uv.lock @@ -958,6 +958,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, ] +[[package]] +name = "deprecation" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5a/d3/8ae2869247df154b64c1884d7346d412fed0c49df84db635aab2d1c40e62/deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff", size = 173788, upload-time = "2020-04-20T14:23:38.738Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/c3/253a89ee03fc9b9682f1541728eb66db7db22148cd94f89ab22528cd1e1b/deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a", size = 11178, upload-time = "2020-04-20T14:23:36.581Z" }, +] + [[package]] name = "distlib" version = "0.4.0" @@ -1526,6 +1538,7 @@ all = [ { name = "pyyaml-include" }, { name = "rapidfuzz", version = "3.13.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "rapidfuzz", version = "3.14.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "rapidyaml" }, { name = "sqlmodel" }, { name = "tabulate" }, ] @@ -1562,6 +1575,7 @@ tagpacks = [ { name = "pyyaml-include" }, { name = "rapidfuzz", version = "3.13.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "rapidfuzz", version = "3.14.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "rapidyaml" }, { name = "sqlmodel" }, { name = "tabulate" }, ] @@ -1643,6 +1657,7 @@ requires-dist = [ { name = "pyyaml", marker = "extra == 'tagpacks'", specifier = ">=6.0" }, { name = "pyyaml-include", marker = "extra == 'tagpacks'", specifier = "~=1.3" }, { name = "rapidfuzz", marker = "extra == 'tagpacks'", specifier = ">=3.13.0" }, + { name = "rapidyaml", marker = "extra == 'tagpacks'", specifier = ">=0.10.0" }, { name = "requests", specifier = ">=2.28.1" }, { name = "rich", specifier = ">=12.6.0" }, { name = "setuptools", specifier = ">=80.0.0,<80.9" }, @@ -3456,6 +3471,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/ad/fa2d3e5c29a04ead7eaa731c7cd1f30f9ec3c77b3a578fdf90280797cbcb/rapidfuzz-3.14.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56fefb4382bb12250f164250240b9dd7772e41c5c8ae976fd598a32292449cc5", size = 1511361, upload-time = "2025-11-01T11:54:49.057Z" }, ] +[[package]] +name = "rapidyaml" +version = "0.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecation" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/c5/ad4818d69aabd56900f39fffa4baed20897718bc39f6411092cb92f46595/rapidyaml-0.10.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f3ad737c74e98a01c0481bcf04c51502a9bea7460dbaa484be8a03a955947376", size = 3861671, upload-time = "2025-09-30T20:33:15.609Z" }, + { url = "https://files.pythonhosted.org/packages/cd/fd/bab84ffe9bef37897bb709d5e2e3a08a9094ee5826a5b70226ce86af5c6b/rapidyaml-0.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1c6143ce693329c290c0fd62511976842753d4489be464aaff6261948c9c4323", size = 3889229, upload-time = "2025-09-30T20:33:17.588Z" }, + { url = "https://files.pythonhosted.org/packages/a2/27/fff285d3eaabb3e121cabc716f7fdbebaa63e9357f07b12d5163d7e02418/rapidyaml-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9476e13c2b3e1c4dec65213a94b290088ec568494fa9323e88fee368605a2c25", size = 3861668, upload-time = "2025-09-30T20:33:19.031Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9e/9a008e0eabb25188d58e3f04d831e4f61f2cf6691be157c7605e2f84386d/rapidyaml-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bc88a2eec7b0d45b8e4233c229ce61057af50003591348e3094199962e0a6c5", size = 245407, upload-time = "2025-09-30T20:33:20.56Z" }, + { url = "https://files.pythonhosted.org/packages/91/0e/eed0cb0823b8a1bb4c7cbba72ad79ad7c6a60cb86f0227e3cdd90b3f49fe/rapidyaml-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b990ba46c79838619dbdd3a106b3cc931c8148a2c3f3ec20adce4b7d1c78a14", size = 253339, upload-time = "2025-09-30T20:33:21.973Z" }, + { url = "https://files.pythonhosted.org/packages/86/52/16fb35f5e3f6586e25edf691b728440f5db792e6177f772217a15a93d3b6/rapidyaml-0.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:649ba01585102b36203f2b68deaf2faa146aadd4b94450177d9cfdaa57cbfb38", size = 261688, upload-time = "2025-09-30T20:33:23.418Z" }, + { url = "https://files.pythonhosted.org/packages/c1/c6/a652abdaffd8429903ce3653a29f70e44ff1644971da46c734e45f59c236/rapidyaml-0.10.0-cp310-cp310-win32.whl", hash = "sha256:8d616bc755b53623dc049dbfa5b5ae2edd538ef8dfc05d949bd195d5beef0b8b", size = 241629, upload-time = "2025-09-30T20:33:24.47Z" }, + { url = "https://files.pythonhosted.org/packages/83/7a/433d13ab392ef672501285355dc5bac6bd18bed6e9e265a99278d33362ed/rapidyaml-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:287a38d2b315f6d5a3329e778b617e88cb52cc5d6ec86c38e7fd910de40dc5a6", size = 303425, upload-time = "2025-09-30T20:33:25.967Z" }, + { url = "https://files.pythonhosted.org/packages/c4/2b/70b59a815e5b486d67d871023bf55777bc9d66c26cf9b16c9951b55df3ac/rapidyaml-0.10.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bb807a55279cfc36109c6dd11d244a650d97a2aef37b7a20d010586428df9e25", size = 4860662, upload-time = "2025-09-30T20:33:27.162Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ad/3fe9ad5216e12c785f1ca98dd803f1bee20372fa6ba3c7d7cf8a473d82f9/rapidyaml-0.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:25b582fdda8134135f2b039f920995a9429703d207477343ec67dfbe468275bf", size = 4888223, upload-time = "2025-09-30T20:33:28.961Z" }, + { url = "https://files.pythonhosted.org/packages/de/93/70d0288386a6fed0947feb5f04a72238b60f5ae082679fadc553a7b57894/rapidyaml-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dacf453a7f87ca791c760c705a1baf7c2802931ac1ad11f5e36fabc93ef9e4ed", size = 4860658, upload-time = "2025-09-30T20:33:30.927Z" }, + { url = "https://files.pythonhosted.org/packages/88/b3/cb8978aaf0d0aed8657577f0eea55386b62eb3c6cc5cdb75cd88e54a354e/rapidyaml-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c30526c2810bc40b6b7078aaf613223818de6ba6fbbf66c1014fcac3abc85a2c", size = 245418, upload-time = "2025-09-30T20:33:32.503Z" }, + { url = "https://files.pythonhosted.org/packages/fd/39/d0d9c7ec5f27f9ff01da544085862f7268748f210ae4298ec954e5a0d386/rapidyaml-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e77f9e3aec362a5ffbce5511672fc4b1f09a6b43bcd920b354410c9c7b5ceaf8", size = 253351, upload-time = "2025-09-30T20:33:33.966Z" }, + { url = "https://files.pythonhosted.org/packages/89/24/51761e1f452091a90579f49f4585cad9d06caffaabc9b6cebd67324d6172/rapidyaml-0.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34d242b0c1d936aedccb60939a8e8fbc5d13435f42ac8657b12ce36fdcff972d", size = 261691, upload-time = "2025-09-30T20:33:35.458Z" }, + { url = "https://files.pythonhosted.org/packages/be/6f/4293adc1fac176c01464f7b6d1995aa021ef18fdbb8c2051751d21869fcf/rapidyaml-0.10.0-cp311-cp311-win32.whl", hash = "sha256:407f66c81295d18425d37012d5453b8ddcf5e6fbfba1e0e26d645e52830d99db", size = 241668, upload-time = "2025-09-30T20:33:36.833Z" }, + { url = "https://files.pythonhosted.org/packages/23/a1/082401024d2595996941db7719ab93f70ce06ef8d3a8a5b79cae7d8e65f9/rapidyaml-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:2bf1399c6222b29b9943d2d67b107228bfd885ff618a1f63d924fba7564e5cd5", size = 303436, upload-time = "2025-09-30T20:33:37.958Z" }, + { url = "https://files.pythonhosted.org/packages/2e/5d/30c777779b8288dfbdac2583fb3a371ea4de88dbaa4e0a56c519a002ae53/rapidyaml-0.10.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:295bbbb51b6634038b0c7718b42911983122b2b05bafd708c36cee33044b0f9d", size = 2938393, upload-time = "2025-09-30T20:34:15.88Z" }, + { url = "https://files.pythonhosted.org/packages/0a/87/5fc8d2f41d4da482c9f7a14381a571de35041db5db75e86bdd747ebf222b/rapidyaml-0.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:99d3f476ab2bc5626d5c1d9081642cd0666395ec73c50932c07b3b47c056da5e", size = 2965974, upload-time = "2025-09-30T20:34:17.626Z" }, + { url = "https://files.pythonhosted.org/packages/5b/76/83a5c9ce0aac96b9e83a9cef57fea5a0b58a63525812e273e1a31676fbff/rapidyaml-0.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c33a0080f965baedac18b0353de93720f24a13a446b826624f702e24be602c2b", size = 2938392, upload-time = "2025-09-30T20:34:19.201Z" }, + { url = "https://files.pythonhosted.org/packages/20/f1/de05d049be76171444791f012f26fd08a9fc61fa74b4d5c70e92b3e30810/rapidyaml-0.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1a86352016eca427a2d31dfd0134e1ed70bee4f7a33d17cbc57e5e80cea9103", size = 245235, upload-time = "2025-09-30T20:34:20.314Z" }, + { url = "https://files.pythonhosted.org/packages/3f/54/5ff6530f38f828799dda42ce75692537e2973b9b5d96ea2eab31011e86da/rapidyaml-0.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4505450affc7ab925db30174a96d38023aa7e2789834cb33c03b4059d2e9b53", size = 253105, upload-time = "2025-09-30T20:34:21.582Z" }, + { url = "https://files.pythonhosted.org/packages/3a/e5/9f56454aa2a47f053164047050a44fd3ac27c22eb3d7dda56f42f63b6f64/rapidyaml-0.10.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e140120c7b7fab5b037b937a9f6705ceed4d00285be0c382ae78e213148fbf7", size = 261704, upload-time = "2025-09-30T20:34:22.726Z" }, + { url = "https://files.pythonhosted.org/packages/2f/1e/f534637bc612f23f6279d4a50a85cf67d563b7614e4ec6863c840a0b2b0b/rapidyaml-0.10.0-cp39-cp39-win32.whl", hash = "sha256:79000893991fbc8e0d5dbe166ca1bb3d148d831bcd00a0399918e54e3de634df", size = 241697, upload-time = "2025-09-30T20:34:24.196Z" }, + { url = "https://files.pythonhosted.org/packages/51/eb/6bd609f0ebdd6166f0b3e7dcd923225bd757d8128729491aa641eda49a2d/rapidyaml-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:ce8414f7205121c2d74862b371c922067331f6527acf0b1acf79a357730944cf", size = 303454, upload-time = "2025-09-30T20:34:25.657Z" }, +] + [[package]] name = "referencing" version = "0.36.2" From dbb2061f07c03d4615e2fbc7d02fee305aed459a Mon Sep 17 00:00:00 2001 From: Thomas Niedermayer Date: Fri, 19 Dec 2025 14:33:09 +0100 Subject: [PATCH 3/3] More performance optimizations --- src/graphsenselib/tagpack/schema.py | 57 ++++++++++++++++----- src/graphsenselib/tagpack/tagpack_schema.py | 35 ++++++++++++- tests/tagpack/test_actorpack.py | 2 +- 3 files changed, 77 insertions(+), 17 deletions(-) diff --git a/src/graphsenselib/tagpack/schema.py b/src/graphsenselib/tagpack/schema.py index 9604841..db7ed55 100644 --- a/src/graphsenselib/tagpack/schema.py +++ b/src/graphsenselib/tagpack/schema.py @@ -4,40 +4,61 @@ from graphsenselib.tagpack import ValidationError +# Caches +_type_def_cache: dict[int, dict[str, dict]] = {} # udts_id -> {item_type -> definition} +_mandatory_fields_cache: dict[int, frozenset[str]] = {} # schema_id -> mandatory fields + + +def _get_type_def_cache(udts: dict) -> dict[str, dict]: + """Get or create type definition cache for a udts instance.""" + udts_id = id(udts) + if udts_id not in _type_def_cache: + _type_def_cache[udts_id] = {} + return _type_def_cache[udts_id] + def load_field_type_definition(udts, item_type): + cache = _get_type_def_cache(udts) + if item_type in cache: + return cache[item_type] + if item_type.startswith("@"): fd = udts.get(item_type[1:]) if fd is None: raise ValidationError(f"No type {item_type[1:]} found in the schema.") - return fd + result = fd else: - return {"type": item_type} + result = {"type": item_type} + + cache[item_type] = result + return result + + +def _get_mandatory_fields(schema_def: dict) -> frozenset[str]: + """Get mandatory fields for a schema definition (cached).""" + schema_id = id(schema_def) + if schema_id not in _mandatory_fields_cache: + _mandatory_fields_cache[schema_id] = frozenset( + k for k, v in schema_def.items() if bool(v.get("mandatory", False)) + ) + return _mandatory_fields_cache[schema_id] def check_type_list_items(udts, field_name, field_definition, lst): if "item_type" in field_definition: + item_def = load_field_type_definition(udts, field_definition["item_type"]) for i, x in enumerate(lst): - check_type( - udts, - f"{field_name}[{i}]", - load_field_type_definition(udts, field_definition["item_type"]), - x, - ) + check_type(udts, f"{field_name}[{i}]", item_def, x) def check_type_dict(udts, field_name, field_definition, dct): if "item_type" in field_definition: fd_def = load_field_type_definition(udts, field_definition["item_type"]) - if type(fd_def) is str: raise ValidationError(f"Type of dict {field_name} is a basic type {fd_def}") - # check mandatory entries - mandatory_fields = [ - k for k, v in fd_def.items() if bool(v.get("mandatory", False)) - ] - + # Use cached mandatory fields + mandatory_fields = _get_mandatory_fields(fd_def) for field in mandatory_fields: if field not in dct: raise ValidationError(f"Mandatory field {field} not in {dct}") @@ -51,11 +72,13 @@ def check_type_dict(udts, field_name, field_definition, dct): def check_type(udts, field_name, field_definition, value): """Checks whether a field's type matches the definition""" schema_type = field_definition["type"] + if schema_type == "text": if not isinstance(value, str): raise ValidationError("Field {} must be of type text".format(field_name)) if len(value.strip()) == 0: raise ValidationError("Empty value in text field {}".format(field_name)) + elif schema_type == "datetime": if not isinstance(value, datetime.date) and not isinstance( value, datetime.datetime @@ -63,13 +86,16 @@ def check_type(udts, field_name, field_definition, value): raise ValidationError( f"Field {field_name} must be of type datetime. Found {type(value)}" ) + elif schema_type == "boolean": if not isinstance(value, bool): raise ValidationError(f"Field {field_name} must be of type boolean") + elif schema_type == "list": if not isinstance(value, list): raise ValidationError(f"Field {field_name} must be of type list") check_type_list_items(udts, field_name, field_definition, value) + elif schema_type == "json_text": try: json_data = json.loads(value) @@ -78,8 +104,11 @@ def check_type(udts, field_name, field_definition, value): f"Invalid JSON in field {field_name} with value {value}: {e}" ) check_type_dict(udts, field_name, field_definition, json_data) + elif schema_type == "dict": check_type_dict(udts, field_name, field_definition, value) + else: raise ValidationError("Unsupported schema type {}".format(schema_type)) + return True diff --git a/src/graphsenselib/tagpack/tagpack_schema.py b/src/graphsenselib/tagpack/tagpack_schema.py index 2985e1a..f5fe06c 100644 --- a/src/graphsenselib/tagpack/tagpack_schema.py +++ b/src/graphsenselib/tagpack/tagpack_schema.py @@ -12,18 +12,42 @@ CONFIDENCE_FILE = "confidence.csv" +class _ObservableDict(dict): + """Dict that calls a callback when modified.""" + + def __init__(self, data, on_change): + super().__init__(data) + self._on_change = on_change + # Wrap nested dicts + for k, v in self.items(): + if isinstance(v, dict) and not isinstance(v, _ObservableDict): + super().__setitem__(k, _ObservableDict(v, on_change)) + + def __setitem__(self, key, value): + if isinstance(value, dict) and not isinstance(value, _ObservableDict): + value = _ObservableDict(value, self._on_change) + super().__setitem__(key, value) + self._on_change() + + def __delitem__(self, key): + super().__delitem__(key) + self._on_change() + + class TagPackSchema(object): """Defines the structure of a TagPack and supports validation""" def __init__(self): with open_pkgresource_file(TAGPACK_SCHEMA_FILE) as f: schema = f.read() # pkg_resources.read_text(conf, TAGPACK_SCHEMA_FILE) - self.schema = yaml.safe_load(schema) + raw_schema = yaml.safe_load(schema) with open_pkgresource_file(CONFIDENCE_FILE) as confidence: # confidence = pkg_resources.open_text(db, CONFIDENCE_FILE) self.confidences = pd.read_csv(confidence, index_col="id") self.definition = TAGPACK_SCHEMA_FILE + # Wrap schema in observable dict that auto-rebuilds cache on changes + self.schema = _ObservableDict(raw_schema, self._rebuild_cache) self._header_fields = self.schema["header"] self._tag_fields = self.schema["tag"] self._mandatory_header_fields = { @@ -33,6 +57,13 @@ def __init__(self): k: v for k, v in self._tag_fields.items() if v["mandatory"] } self._taxonomy_cache = {} + self._all_fields = {**self._header_fields, **self._tag_fields} + + def _rebuild_cache(self): + """Rebuild caches after schema modifications.""" + self._header_fields = self.schema["header"] + self._tag_fields = self.schema["tag"] + self._all_fields = {**self._header_fields, **self._tag_fields} @property def header_fields(self): @@ -53,7 +84,7 @@ def mandatory_tag_fields(self): @property def all_fields(self): """Returns all header and body fields""" - return {**self._header_fields, **self._tag_fields} + return self._all_fields def field_type(self, field): return self.all_fields[field]["type"] diff --git a/tests/tagpack/test_actorpack.py b/tests/tagpack/test_actorpack.py index 36be21b..4e5c994 100644 --- a/tests/tagpack/test_actorpack.py +++ b/tests/tagpack/test_actorpack.py @@ -198,7 +198,7 @@ def test_validate_wrong_with_mandatory_context_field( ): with pytest.raises(ValidationError) as e: assert actorpack_wrong_with_mandatory_context_field.validate() - assert "Mandatory field refs not in" in str(e.value) + assert "Mandatory field" in str(e.value) and "refs" in str(e.value) def test_validate(actorpack):