-
Notifications
You must be signed in to change notification settings - Fork 6
Feature/faster validation #38
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,17 @@ | |
| except ImportError: | ||
| from yaml import SafeLoader as SafeLoader | ||
|
|
||
| # Fast YAML loading using rapidyaml | ||
| import warnings | ||
|
|
||
| with warnings.catch_warnings(): | ||
| warnings.filterwarnings( | ||
| "ignore", | ||
| message="builtin type Swig.*has no __module__ attribute", | ||
| category=DeprecationWarning, | ||
| ) | ||
| import ryml as _ryml | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it a good idea to just remove these warnings? |
||
| if sys.version_info[:2] >= (3, 8): | ||
| # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` | ||
| from importlib.metadata import PackageNotFoundError, version # pragma: no cover | ||
|
|
@@ -62,9 +73,46 @@ def __str__(self): | |
| class UniqueKeyLoader(SafeLoader): | ||
| def construct_mapping(self, node, deep=False): | ||
| mapping = set() | ||
| for key_node, value_node in node.value: | ||
| for key_node, _ in node.value: | ||
| key = self.construct_object(key_node, deep=deep) | ||
| if key in mapping: | ||
| raise ValidationError(f"Duplicate {key!r} key found in YAML.") | ||
| mapping.add(key) | ||
| return super().construct_mapping(node, deep) | ||
|
|
||
|
|
||
| def _check_duplicate_keys_ryml(tree, root_id): | ||
| """Check for duplicate keys at the top level of ryml tree.""" | ||
| if not tree.is_map(root_id): | ||
| return | ||
| keys = set() | ||
| for i in range(tree.num_children(root_id)): | ||
| child_id = tree.child(root_id, i) | ||
| key = tree.key(child_id) | ||
| if key in keys: | ||
| raise ValidationError( | ||
| f"Duplicate {key.tobytes().decode()!r} key found in YAML." | ||
| ) | ||
| keys.add(key) | ||
|
|
||
|
|
||
| def load_yaml_fast(file_path): | ||
| """Load YAML using rapidyaml (~10x faster) for large files, else PyYAML.""" | ||
| import json | ||
| import os | ||
| import yaml | ||
|
|
||
| file_size = os.path.getsize(file_path) | ||
|
|
||
| # Use UniqueKeyLoader for small files (duplicate key detection) | ||
| if file_size < 100 * 1024: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason to fallback on small files? |
||
| with open(file_path, "r") as f: | ||
| return yaml.load(f, UniqueKeyLoader) | ||
|
|
||
| # Fast path: ryml -> check duplicates -> JSON -> json.loads | ||
| with open(file_path, "rb") as f: | ||
| content = f.read() | ||
| tree = _ryml.parse_in_arena(content) | ||
| _check_duplicate_keys_ryml(tree, tree.root_id()) | ||
| json_bytes = _ryml.emit_json_malloc(tree, tree.root_id()) | ||
| return json.loads(json_bytes) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -450,8 +450,17 @@ def validate_tagpack( | |
|
|
||
| # Check actors if enabled | ||
| if check_actor_references and actorpack: | ||
| actor = tagpack.all_header_fields.get("actor") | ||
| if actor: | ||
| # Collect actors from header level and tag level | ||
| actors_to_check = set() | ||
| header_actor = tagpack.all_header_fields.get("actor") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason we need more cases? or is this a new feature? |
||
| if header_actor: | ||
| actors_to_check.add(header_actor) | ||
| for tag in tagpack.tags: | ||
| tag_actor = tag.all_fields.get("actor") | ||
| if tag_actor: | ||
| actors_to_check.add(tag_actor) | ||
|
|
||
| for actor in actors_to_check: | ||
| resolved = actorpack.resolve_actor(actor) | ||
| if resolved is None: | ||
| # Unknown actor | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this really a change or is this a artifact of not merging the last release to master?