diff --git a/__init__.py b/__init__.py index 02a5d1e..b79fbe1 100644 --- a/__init__.py +++ b/__init__.py @@ -1,545 +1,11 @@ -import builtins -from dataclasses import dataclass -import itertools -import numbers -import sys -import re - -from collections import defaultdict -import types - -from lark import Lark, Transformer, Tree, Token -from lark.exceptions import UnexpectedToken, UnexpectedCharacters try: - from .mvd_info import MvdInfo, LARK_AVAILABLE -except ImportError: # in case of running module locally (e.g. test_parser.py) - from mvd_info import MvdInfo, LARK_AVAILABLE - -class ValidationError(Exception): - pass - -from collections import namedtuple - -class SyntaxError(ValidationError): - def __init__(self, filecontent, exception): - self.filecontent = filecontent - self.exception = exception - - def asdict(self, with_message=True): - return { - "type": ( - "unexpected_token" - if isinstance(self.exception, UnexpectedToken) - else "unexpected_character" - ), - "lineno": self.exception.line, - "column": self.exception.column, - "found_type": self.exception.token.type.lower(), - "found_value": self.exception.token.value, - "expected": sorted(x for x in self.exception.accepts if "__ANON" not in x), - "line": self.filecontent.split("\n")[self.exception.line - 1], - **({"message": str(self)} if with_message else {}), - } - - def __str__(self): - d = self.asdict(with_message=False) - if len(d["expected"]) == 1: - exp = d["expected"][0] - else: - exp = f"one of {' '.join(d['expected'])}" - - sth = "character" if d["type"] == "unexpected_character" else "" - - return f"On line {d['lineno']} column {d['column']}:\nUnexpected {sth}{d['found_type']} ('{d['found_value']}')\nExpecting {exp}\n{d['lineno']:05d} | {d['line']}\n {' ' * (self.exception.column - 1)}^" - - -class DuplicateNameError(ValidationError): - def __init__(self, filecontent, name, linenumbers): - self.name = name - self.filecontent = filecontent - self.linenumbers = linenumbers - - def asdict(self, with_message=True): - return { - "type": "duplicate_name", - "name": self.name, - "lineno": self.linenumbers[0], - "line": self.filecontent.split("\n")[self.linenumbers[0] - 1], - **({"message": str(self)} if with_message else {}), - } - - def __str__(self): - d = self.asdict(with_message=False) - - def build(): - yield f"On line {d['lineno']}:\nDuplicate instance name #{d['name']}" - yield f"{d['lineno']:05d} | {d['line']}" - yield " " * 8 + "^" * len(d["line"].rstrip()) - - return "\n".join(build()) - - -grammar = r""" -file: "ISO-10303-21;" header data_section "END-ISO-10303-21;" -header: "HEADER" ";" header_entity_list "ENDSEC" ";" -header_line: (SPECIAL|DIGIT|LOWER|UPPER)* "*" -data_section: "DATA" ";" (entity_instance)* "ENDSEC" ";" -entity_instance: simple_entity_instance|complex_entity_instance -simple_entity_instance: id "=" simple_record ";" -complex_entity_instance: id "=" subsuper_record ";" -subsuper_record : "(" simple_record_list ")" -simple_record_list:simple_record simple_record* -simple_record: keyword "("parameter_list?")" -header_entity_list: file_description file_name file_schema -file_description: "FILE_DESCRIPTION" "(" parameter_list ")" ";" -file_name: "FILE_NAME" "(" parameter_list ")" ";" -file_schema: "FILE_SCHEMA" "(" parameter_list ")" ";" -id: /#[0-9]+/ -keyword: /[A-Z][0-9A-Z_]*/ -parameter: untyped_parameter|typed_parameter|omitted_parameter -parameter_list: parameter ("," parameter)* -list: "(" parameter ("," parameter)* ")" |"("")" -typed_parameter: keyword "(" parameter ")"|"()" -untyped_parameter: string| NONE |INT |REAL |enumeration |id |binary |list -omitted_parameter:STAR -enumeration: "." keyword "." -binary: "\"" ("0"|"1"|"2"|"3") (HEX)* "\"" -string: "'" (REVERSE_SOLIDUS REVERSE_SOLIDUS|SPECIAL|DIGIT|SPACE|LOWER|UPPER|CONTROL_DIRECTIVE|"\\*\\")* "'" - -STAR: "*" -SLASH: "/" -NONE: "$" -SPECIAL : "!" - | "*" - | "$" - | "%" - | "&" - | "." - | "#" - | "+" - | "," - | "-" - | "(" - | ")" - | "?" - | "/" - | ":" - | ";" - | "<" - | "=" - | ">" - | "@" - | "[" - | "]" - | "{" - | "|" - | "}" - | "^" - | "`" - | "~" - | "_" - | "\"" - | "\"\"" - | "''" -REAL: SIGN? DIGIT (DIGIT)* "." (DIGIT)* ("E" SIGN? DIGIT (DIGIT)* )? -INT: SIGN? DIGIT (DIGIT)* -CONTROL_DIRECTIVE: PAGE | ALPHABET | EXTENDED2 | EXTENDED4 | ARBITRARY -PAGE : REVERSE_SOLIDUS "S" REVERSE_SOLIDUS LATIN_CODEPOINT -LATIN_CODEPOINT : SPACE | DIGIT | LOWER | UPPER | SPECIAL | REVERSE_SOLIDUS | APOSTROPHE -ALPHABET : REVERSE_SOLIDUS "P" UPPER REVERSE_SOLIDUS -EXTENDED2: REVERSE_SOLIDUS "X2" REVERSE_SOLIDUS (HEX_TWO)* END_EXTENDED -EXTENDED4 :REVERSE_SOLIDUS "X4" REVERSE_SOLIDUS (HEX_FOUR)* END_EXTENDED -END_EXTENDED: REVERSE_SOLIDUS "X0" REVERSE_SOLIDUS -ARBITRARY: REVERSE_SOLIDUS "X" REVERSE_SOLIDUS HEX_ONE -HEX_FOUR: HEX_TWO HEX_TWO -HEX_TWO: HEX_ONE HEX_ONE -HEX_ONE: HEX HEX -HEX: "0" - | "1" - | "2" - | "3" - | "4" - | "5" - | "6" - | "7" - | "8" - | "9" - | "A" - | "B" - | "C" - | "D" - | "E" - | "F" -APOSTROPHE: "'" -REVERSE_SOLIDUS: "\\" -DIGIT: "0".."9" -SIGN: "+"|"-" -LOWER: "a".."z" -UPPER: "A".."Z" -ESCAPE : "\\" ( "$" | "\"" | CHAR ) -CHAR : /[^$"\n]/ -WORD : CHAR+ -SPACE.10 : " " - -%ignore /[ \t\f\r\n]/+ -""" - - -class Ref: - def __init__(self, id): - self.id = id - - def __str__(self): - return "#" + str(self.id) - - __repr__ = __str__ - - -class IfcType: - def __init__(self, ifctype, value): - self.ifctype = ifctype - self.value = value - - def __str__(self): - return self.ifctype + "(" + str(self.value) + ")" - - __repr__ = __str__ - - -class T(Transformer): - def id(self, s): - return int(s[0][1:]) - - def string(self, s): - word = "".join(s).replace("''", "'") - return word - - def keyword(self, s): - word = "".join(s) - return word - - def untyped_parameter(self, s): - return s[0] - - def parameter(self, s): - return s[0] - - def typed_parameter(self, s): - if len(s): - return IfcType(s[0], s[1]) - else: - return () - - def omitted_parameter(self, s): - return s[0] - - def enumeration(self, s): - return s[0] - - parameter_list = tuple - list = tuple - subsuper_record = list - INT = int - REAL = float - NONE = lambda *args: None - STAR = str - - -@dataclass -class entity_instance: - id: int - type: str - attributes: tuple - lines: tuple - - def __getitem__(self, k): - if isinstance(k, numbers.Integral): - return self.attributes[k] - else: - # compatibility with dict - return getattr(self, k) - - def __repr__(self): - return f'#{self.id}={self.type}({",".join(map(str, self.attributes))})' - - -def create_step_entity(entity_tree): - t = T(visit_tokens=True).transform(entity_tree) - - def get_line_number(t): - if isinstance(t, Token): - yield t.line - - def traverse(fn, x): - yield from fn(x) - if isinstance(x, Tree): - for c in x.children: - yield from traverse(fn, c) - - lines = list(traverse(get_line_number, entity_tree)) - - entity_id = t.children[0].children[0] - entity_type = t.children[0].children[1].children[0] - - attributes_tree = t.children[0].children[1].children[1] - attributes = list(attributes_tree) - - return entity_instance( - entity_id, - entity_type, - attributes, - (min(lines), max(lines)), - ) - -def make_header_ent(ast): - rule = ast.data - params = T(visit_tokens=True).transform(ast.children[0]) - return rule.upper(), params - - - -def process_tree(filecontent, file_tree, with_progress, with_header=False): - ents = defaultdict(list) - header, data = file_tree.children - - if with_header: - header = dict(map(make_header_ent, header.children[0].children)) - - n = len(data.children) - if n: - percentages = [i * 100.0 / n for i in range(n + 1)] - num_dots = [int(b) - int(a) for a, b in zip(percentages, percentages[1:])] - - for idx, entity_tree in enumerate(data.children): - if with_progress: - sys.stdout.write(num_dots[idx] * ".") - sys.stdout.flush() - ent = create_step_entity(entity_tree) - id_ = int(ent["id"]) - if ents[id_]: - raise DuplicateNameError(filecontent, ent["id"], ent["lines"]) - ents[id_].append(ent) - - if with_header: - return header, ents - else: - return ents - - -def parse( - *, - filename=None, - filecontent=None, - with_progress=False, - with_tree=True, - with_header=False, - only_header=False, -): - if filename: - assert not filecontent - filecontent = builtins.open(filename, encoding=None).read() - - if only_header: - with_header = True - - # Match and remove the comments - p = r"/\*[\s\S]*?\*/" - - def replace_fn(match): - return re.sub(r"[^\n]", " ", match.group(), flags=re.M) - - filecontent_wo_comments = re.sub(p, replace_fn, filecontent) - - - if only_header: - # Extract just the HEADER section using regex - header_match = re.search( - r"ISO-10303-21;\s*HEADER;(.*?)ENDSEC;", - filecontent_wo_comments, - flags=re.DOTALL | re.IGNORECASE, - ) - if not header_match: - raise ValidationError("No HEADER section found in file") - - header_text = f"HEADER;{header_match.group(1)}ENDSEC;" - full_header_text = f"ISO-10303-21;{header_text}DATA;ENDSEC;END-ISO-10303-21;" - - parser = Lark(grammar, parser="lalr", start="file") - try: - ast = parser.parse(full_header_text) - except (UnexpectedToken, UnexpectedCharacters) as e: - raise SyntaxError(filecontent, e) - - header_tree = ast.children[0] # HEADER section - - header = dict(map(make_header_ent, header_tree.children[0].children)) - return header - - - instance_identifiers = [] - transformer = {} - if not with_tree: - # If we're not going to return the tree, we also don't need to - # keep in memory while parsing. So we build a transformer that - # just returns None for every rule. lark creates a dictionary - # of callbacks from the transformer type object, so we can't - # simply use __getattr__ we need an actual type objects with - # callback functions for the rules given in the grammar. - - # Create a temporary parser just for analysing the grammar - temp = Lark(grammar, parser="lalr", start="file") - # Extract the rule names - rule_names = filter( - lambda s: not s.startswith("_"), set(r.origin.name for r in temp.rules) - ) - null_function = lambda self, *args: None - # Create dictionary of methods for type() creation - methods = {r: null_function for r in rule_names} - - # Even in this case we do want to report duplicate identifiers - # so these need to be captured - methods["id"] = lambda self, *args: args - methods["simple_entity_instance"] = ( - lambda self, tree: instance_identifiers.append( - (int(tree[0][0][0][1:]), int(tree[0][0][0].line)) - ) - ) - - NT = type("NullTransformer", (Transformer,), methods) - transformer = {"transformer": NT()} - - parser = Lark(grammar, parser="lalr", start="file", **transformer) - - try: - ast = parser.parse(filecontent_wo_comments) - except (UnexpectedToken, UnexpectedCharacters) as e: - raise SyntaxError(filecontent, e) - - if with_tree: - return process_tree(filecontent, ast, with_progress, with_header) - else: - # process_tree() would take care of duplicate identifiers, - # but we need to do it ourselves now using our rudimentary - # transformer - seen = set() - for iden, lineno in instance_identifiers: - if iden in seen: - raise DuplicateNameError(filecontent, iden, [lineno, lineno]) - seen.add(iden) - - -class file: - """ - A somewhat compatible interface (but very limited) to ifcopenshell.file - """ - - def __init__(self, parse_outcomes): - self.header_, self.data_ = parse_outcomes - - @property - def schema_identifier(self) -> str: - return self.header_["FILE_SCHEMA"][0][0] - - @property - def schema(self) -> str: - """General IFC schema version: IFC2X3, IFC4, IFC4X3.""" - prefixes = ("IFC", "X", "_ADD", "_TC") - reg = "".join(f"(?P<{s}>{s}\\d+)?" for s in prefixes) - match = re.match(reg, self.schema_identifier) - version_tuple = tuple( - map( - lambda pp: int(pp[1][len(pp[0]) :]) if pp[1] else None, - ((p, match.group(p)) for p in prefixes), - ) - ) - return "".join( - "".join(map(str, t)) if t[1] else "" - for t in zip(prefixes, version_tuple[0:2]) - ) - - @property - def schema_version(self) -> tuple[int, int, int, int]: - """Numeric representation of the full IFC schema version. - - E.g. IFC4X3_ADD2 is represented as (4, 3, 2, 0). - """ - schema = self.wrapped_data.schema - version = [] - for prefix in ("IFC", "X", "_ADD", "_TC"): - number = re.search(prefix + r"(\d)", schema) - version.append(int(number.group(1)) if number else 0) - return tuple(version) - - - @property - def header(self): - HEADER_FIELDS = { - "file_description": namedtuple('file_description', ['description', 'implementation_level']), - "file_name": namedtuple('file_name', ['name', 'time_stamp', 'author', 'organization', 'preprocessor_version', 'originating_system', 'authorization']), - "file_schema": namedtuple('file_schema', ['schema_identifiers']), - } - header = {} - - for field_name, namedtuple_class in HEADER_FIELDS.items(): - field_data = self.header_.get(field_name.upper(), []) - header[field_name.lower()] = namedtuple_class(*field_data) - - return types.SimpleNamespace(**header) - - - @property - def mvd(self): - if not LARK_AVAILABLE or MvdInfo is None: - return None - return MvdInfo(self.header) - - def __getitem__(self, key: numbers.Integral) -> entity_instance: - return self.by_id(key) - - def by_id(self, id: int) -> entity_instance: - """Return an IFC entity instance filtered by IFC ID. - - :param id: STEP numerical identifier - :type id: int - - :raises RuntimeError: If `id` is not found or multiple definitions exist for `id`. - - :rtype: entity_instance - """ - ns = self.data_.get(id, []) - if len(ns) == 0: - raise RuntimeError(f"Instance with id {id} not found") - elif len(ns) > 1: - raise RuntimeError(f"Duplicate definition for id {id}") - return ns[0] - - def by_type(self, type: str) -> list[entity_instance]: - """Return IFC objects filtered by IFC Type and wrapped with the entity_instance class. - :rtype: list[entity_instance] - """ - type_lc = type.lower() - return list( - filter( - lambda ent: ent.type.lower() == type_lc, - itertools.chain.from_iterable(self.data_.values()), - ) - ) - - -def open(fn, only_header: bool = False) -> file: - if only_header: # Ensure consistent options - parse_outcomes = parse( - filename=fn, - with_tree=True, - with_header=True, # must be True to return the header - only_header=True, - ) - return file((parse_outcomes, defaultdict(list))) # data section is empty - else: - parse_outcomes = parse( - filename=fn, - with_tree=True, - with_header=True, - only_header=False, - ) - return file(parse_outcomes) \ No newline at end of file + from parser.parse import parse + from parser.file import file, open + from parser.errors import _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError +except: + from .parser.parse import parse + from .parser.file import file, open + from .parser.errors import _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError + +__all__ = ["parse", "open", "file", "_ValidationError", + "CollectedValidationErrors", "DuplicateNameError", "HeaderFieldError"] # for testing \ No newline at end of file diff --git a/__main__.py b/__main__.py index 7230888..537d022 100644 --- a/__main__.py +++ b/__main__.py @@ -1,7 +1,7 @@ import sys import json import argparse -from . import parse, ValidationError +from . import parse, CollectedValidationErrors def main(): parser = argparse.ArgumentParser(description="Parse and validate STEP file.") @@ -22,11 +22,11 @@ def main(): if not args.json: print("Valid", file=sys.stderr) exit(0) - except ValidationError as exc: + except CollectedValidationErrors as exc: if not args.json: print(exc, file=sys.stderr) else: - json.dump(exc.asdict(), sys.stdout) + json.dump([e.asdict() for e in exc.errors], sys.stdout, indent=2) exit(1) if __name__ == '__main__': diff --git a/fixtures/fail_multiple_duplicate_ids.ifc b/fixtures/fail_multiple_duplicate_ids.ifc new file mode 100644 index 0000000..1df3800 --- /dev/null +++ b/fixtures/fail_multiple_duplicate_ids.ifc @@ -0,0 +1,29 @@ +ISO-10303-21; +HEADER; +FILE_DESCRIPTION(('ViewDefinition [CoordinationView]'),'2;1'); +FILE_NAME('','2022-05-04T08:08:30',(''),(''),'IfcOpenShell-0.7.0','IfcOpenShell-0.7.0',''); +FILE_SCHEMA(('IFC4')); +ENDSEC; +DATA; +#1=IFCPERSON($,$,'',$,$,$,$,$); +#2=IFCORGANIZATION($,'',$,$,$); +#3=IFCPERSONANDORGANIZATION(#1,#2,$); +#4=IFCAPPLICATION(#2,'0.7.0','IfcOpenShell-0.7.0',''); +#5=IFCOWNERHISTORY(#3,#4,$,.ADDED.,$,#3,#4,1651651710); +#6=IFCDIRECTION((1.,0.,0.)); +#7=IFCDIRECTION((0.,0.,1.)); +#8=IFCCARTESIANPOINT((0.,0.,0.)); +#9=IFCAXIS2PLACEMENT3D(#8,#7,#6); +#10=IFCDIRECTION((0.,1.,0.)); +#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10); +#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0); +#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.); +#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.); +#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.); +#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.); +#18=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16); +#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17); +#19=IFCUNITASSIGNMENT((#13,#14,#15,#18)); +#19=IFCPROJECT('2AyG2X0sb16Bjd4gQc07yZ',#5,'',$,$,$,$,(#11),#19); +ENDSEC; +END-ISO-10303-21; diff --git a/fixtures/fail_multiple_wrong_header_fields.ifc b/fixtures/fail_multiple_wrong_header_fields.ifc new file mode 100644 index 0000000..1ca0687 --- /dev/null +++ b/fixtures/fail_multiple_wrong_header_fields.ifc @@ -0,0 +1,30 @@ +ISO-10303-21; +HEADER; +FILE_DESCRIPTION(('ViewDefinition [ReferenceView_V1.2]', 'ExchangeRequirement [Any]')); +FILE_NAME('Header.ifc','2025-02-13T15:58:45',('tricott'),('Trimble Inc.'),'TrimBimToIFC rel. 4.0.2','Example - Example - 2025.0','IFC4 model', ''); +FILE_SCHEMA(('IFC4')); +ENDSEC; +DATA; +#1=IFCPERSON($,$,'',$,$,$,$,$); +#2=IFCORGANIZATION($,'',$,$,$); +#3=IFCPERSONANDORGANIZATION(#1,#2,$); +#4=IFCAPPLICATION(#2,'v0.7.0-6c9e130ca','IfcOpenShell-v0.7.0-6c9e130ca',''); +#5=IFCOWNERHISTORY(#3,#4,$,.NOTDEFINED.,$,#3,#4,1700419055); +#6=IFCDIRECTION((1.,0.,0.)); +#7=IFCDIRECTION((0.,0.,1.)); +#8=IFCCARTESIANPOINT((0.,0.,0.)); +#9=IFCAXIS2PLACEMENT3D(#8,#7,#6); +#10=IFCDIRECTION((0.,1.)); +#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10); +#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0); +#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.); +#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.); +#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.); +#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.); +#17=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16); +#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17); +#19=IFCUNITASSIGNMENT((#13,#14,#15,#18)); +#20=IFCPROJECT('0iDmeiiLP3AOllitM2Favn',#5,'',$,$,$,$,(#11),#19); +#21=IFCSITE('3rg2jGkIH10RFhrQsGZKRk',#5,$,$,$,$,$,$,$,$,$,$,$,$); +ENDSEC; +END-ISO-10303-21; diff --git a/fixtures/fail_too_many_header_entity_fields.ifc b/fixtures/fail_too_many_header_entity_fields.ifc new file mode 100644 index 0000000..24a853b --- /dev/null +++ b/fixtures/fail_too_many_header_entity_fields.ifc @@ -0,0 +1,30 @@ +ISO-10303-21; +HEADER; +FILE_DESCRIPTION(('ViewDefinition [ReferenceView_V1.2]', 'ExchangeRequirement [Any]'),'2;1'); +FILE_NAME('Header.ifc','2025-02-13T15:58:45',('tricott'),('Trimble Inc.'),'TrimBimToIFC rel. 4.0.2','Example - Example - 2025.0','IFC4 model', ''); +FILE_SCHEMA(('IFC4')); +ENDSEC; +DATA; +#1=IFCPERSON($,$,'',$,$,$,$,$); +#2=IFCORGANIZATION($,'',$,$,$); +#3=IFCPERSONANDORGANIZATION(#1,#2,$); +#4=IFCAPPLICATION(#2,'v0.7.0-6c9e130ca','IfcOpenShell-v0.7.0-6c9e130ca',''); +#5=IFCOWNERHISTORY(#3,#4,$,.NOTDEFINED.,$,#3,#4,1700419055); +#6=IFCDIRECTION((1.,0.,0.)); +#7=IFCDIRECTION((0.,0.,1.)); +#8=IFCCARTESIANPOINT((0.,0.,0.)); +#9=IFCAXIS2PLACEMENT3D(#8,#7,#6); +#10=IFCDIRECTION((0.,1.)); +#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10); +#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0); +#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.); +#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.); +#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.); +#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.); +#17=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16); +#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17); +#19=IFCUNITASSIGNMENT((#13,#14,#15,#18)); +#20=IFCPROJECT('0iDmeiiLP3AOllitM2Favn',#5,'',$,$,$,$,(#11),#19); +#21=IFCSITE('3rg2jGkIH10RFhrQsGZKRk',#5,$,$,$,$,$,$,$,$,$,$,$,$); +ENDSEC; +END-ISO-10303-21; diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parser/errors.py b/parser/errors.py new file mode 100644 index 0000000..2449866 --- /dev/null +++ b/parser/errors.py @@ -0,0 +1,108 @@ +from lark.exceptions import UnexpectedToken + +class _ValidationError(Exception): + def __init__(self, *args, **kwargs): + if self.__class__ is _ValidationError: + raise TypeError("Do not raise _ValidationError directly.") + super().__init__(*args, **kwargs) + +class ErrorCollector: + def __init__(self): + self.errors = [] + + def add(self, error): + self.errors.append(error) + + def raise_if_any(self): + if self.errors: + raise CollectedValidationErrors(self.errors) + +class CollectedValidationErrors(_ValidationError): + def __init__(self, errors): + self.errors = errors + + def asdict(self, with_message=True): + return [e.asdict(with_message=with_message) for e in self.errors] + + def __str__(self): + return f"{len(self.errors)} validation error(s) collected:\n" + "\n\n".join(str(e) for e in self.errors) + +class SyntaxError(_ValidationError): + def __init__(self, filecontent, exception): + self.filecontent = filecontent + self.exception = exception + + def asdict(self, with_message=True): + return { + "type": ( + "unexpected_token" + if isinstance(self.exception, UnexpectedToken) + else "unexpected_character" + ), + "lineno": self.exception.line, + "column": self.exception.column, + "found_type": self.exception.token.type.lower(), + "found_value": self.exception.token.value, + "expected": sorted(x for x in self.exception.accepts if "__ANON" not in x), + "line": self.filecontent.split("\n")[self.exception.line - 1], + **({"message": str(self)} if with_message else {}), + } + + def __str__(self): + d = self.asdict(with_message=False) + if len(d["expected"]) == 1: + exp = d["expected"][0] + else: + exp = f"one of {' '.join(d['expected'])}" + + sth = "character" if d["type"] == "unexpected_character" else "" + + return f"On line {d['lineno']} column {d['column']}:\nUnexpected {sth}{d['found_type']} ('{d['found_value']}')\nExpecting {exp}\n{d['lineno']:05d} | {d['line']}\n {' ' * (self.exception.column - 1)}^" + + +class DuplicateNameError(_ValidationError): + def __init__(self, filecontent, name, linenumbers): + self.name = name + self.filecontent = filecontent + self.linenumbers = linenumbers + + def asdict(self, with_message=True): + return { + "type": "duplicate_name", + "name": self.name, + "lineno": self.linenumbers[0], + "line": self.filecontent.split("\n")[self.linenumbers[0] - 1], + **({"message": str(self)} if with_message else {}), + } + + def __str__(self): + d = self.asdict(with_message=False) + + def build(): + yield f"On line {d['lineno']}:\nDuplicate instance name #{d['name']}" + yield f"{d['lineno']:05d} | {d['line']}" + yield " " * 8 + "^" * len(d["line"].rstrip()) + + return "\n".join(build()) + + +class HeaderFieldError(_ValidationError): + def __init__(self, field, found_len, expected_len): + self.field = field + self.found_len = found_len + self.expected_len = expected_len + + def asdict(self, with_message=True): + return { + "type": "invalid_header_field", + "field": self.field, + "expected_field_count": self.expected_len, + "actual_field_count": self.found_len, + **({"message": str(self)} if with_message else {}), + } + + def __str__(self): + return ( + f"Invalid number of parameters for HEADER field '{self.field}'. " + f"Expected {self.expected_len}, found {self.found_len}." + ) \ No newline at end of file diff --git a/parser/file.py b/parser/file.py new file mode 100644 index 0000000..908d6cf --- /dev/null +++ b/parser/file.py @@ -0,0 +1,108 @@ +import types +import re +import numbers +import itertools + +from .parse import parse, ParseResult +from .grammar import HEADER_FIELDS +from .transformer import entity_instance + +try: + from .mvd_info import MvdInfo, LARK_AVAILABLE +except ImportError: # in case of running module locally (e.g. test_parser.py) + from mvd_info import MvdInfo, LARK_AVAILABLE + +class file: + """ + A somewhat compatible interface (but very limited) to ifcopenshell.file + """ + + def __init__(self, result:ParseResult): + self.header_ = result.header + self.data_ = result.entities + + @property + def schema_identifier(self) -> str: + return self.header_["FILE_SCHEMA"][0][0] + + @property + def schema(self) -> str: + """General IFC schema version: IFC2X3, IFC4, IFC4X3.""" + prefixes = ("IFC", "X", "_ADD", "_TC") + reg = "".join(f"(?P<{s}>{s}\\d+)?" for s in prefixes) + match = re.match(reg, self.schema_identifier) + version_tuple = tuple( + map( + lambda pp: int(pp[1][len(pp[0]) :]) if pp[1] else None, + ((p, match.group(p)) for p in prefixes), + ) + ) + return "".join( + "".join(map(str, t)) if t[1] else "" + for t in zip(prefixes, version_tuple[0:2]) + ) + + @property + def schema_version(self) -> tuple[int, int, int, int]: + """Numeric representation of the full IFC schema version. + + E.g. IFC4X3_ADD2 is represented as (4, 3, 2, 0). + """ + schema = self.wrapped_data.schema + version = [] + for prefix in ("IFC", "X", "_ADD", "_TC"): + number = re.search(prefix + r"(\d)", schema) + version.append(int(number.group(1)) if number else 0) + return tuple(version) + + + @property + def header(self): + header = {} + for field_name, namedtuple_class in HEADER_FIELDS.items(): + field_data = self.header_.get(field_name.upper(), []) + header[field_name.lower()] = namedtuple_class(*field_data) + + return types.SimpleNamespace(**header) + + + @property + def mvd(self): + if not LARK_AVAILABLE or MvdInfo is None: + return None + return MvdInfo(self.header) + + def __getitem__(self, key: numbers.Integral) -> entity_instance: + return self.by_id(key) + + def by_id(self, id: int) -> entity_instance: + """Return an IFC entity instance filtered by IFC ID. + + :param id: STEP numerical identifier + :type id: int + + :raises RuntimeError: If `id` is not found or multiple definitions exist for `id`. + + :rtype: entity_instance + """ + ns = self.data_.get(id, []) + if len(ns) == 0: + raise RuntimeError(f"Instance with id {id} not found") + elif len(ns) > 1: + raise RuntimeError(f"Duplicate definition for id {id}") + return ns[0] + + def by_type(self, type: str) -> list[entity_instance]: + """Return IFC objects filtered by IFC Type and wrapped with the entity_instance class. + :rtype: list[entity_instance] + """ + type_lc = type.lower() + return list( + filter( + lambda ent: ent.type.lower() == type_lc, + itertools.chain.from_iterable(self.data_.values()), + ) + ) + +def open(fn, only_header= False) -> file: + return file(parse(filename=fn, only_header=only_header)) \ No newline at end of file diff --git a/parser/grammar.py b/parser/grammar.py new file mode 100644 index 0000000..9d6a016 --- /dev/null +++ b/parser/grammar.py @@ -0,0 +1,112 @@ +from collections import namedtuple + +grammar = r""" +file: "ISO-10303-21;" header data_section "END-ISO-10303-21;" +header: "HEADER" ";" header_entity_list "ENDSEC" ";" +header_line: (SPECIAL|DIGIT|LOWER|UPPER)* "*" +data_section: "DATA" ";" (entity_instance)* "ENDSEC" ";" +entity_instance: simple_entity_instance|complex_entity_instance +simple_entity_instance: id "=" simple_record ";" +complex_entity_instance: id "=" subsuper_record ";" +subsuper_record : "(" simple_record_list ")" +simple_record_list:simple_record simple_record* +simple_record: keyword "("parameter_list?")" +header_entity_list: file_description file_name file_schema +file_description: "FILE_DESCRIPTION" "(" parameter_list ")" ";" +file_name: "FILE_NAME" "(" parameter_list ")" ";" +file_schema: "FILE_SCHEMA" "(" parameter_list ")" ";" +id: /#[0-9]+/ +keyword: /[A-Z][0-9A-Z_]*/ +parameter: untyped_parameter|typed_parameter|omitted_parameter +parameter_list: parameter ("," parameter)* +list: "(" parameter ("," parameter)* ")" |"("")" +typed_parameter: keyword "(" parameter ")"|"()" +untyped_parameter: string| NONE |INT |REAL |enumeration |id |binary |list +omitted_parameter:STAR +enumeration: "." keyword "." +binary: "\"" ("0"|"1"|"2"|"3") (HEX)* "\"" +string: "'" (REVERSE_SOLIDUS REVERSE_SOLIDUS|SPECIAL|DIGIT|SPACE|LOWER|UPPER|CONTROL_DIRECTIVE|"\\*\\")* "'" + +STAR: "*" +SLASH: "/" +NONE: "$" +SPECIAL : "!" + | "*" + | "$" + | "%" + | "&" + | "." + | "#" + | "+" + | "," + | "-" + | "(" + | ")" + | "?" + | "/" + | ":" + | ";" + | "<" + | "=" + | ">" + | "@" + | "[" + | "]" + | "{" + | "|" + | "}" + | "^" + | "`" + | "~" + | "_" + | "\"" + | "\"\"" + | "''" +REAL: SIGN? DIGIT (DIGIT)* "." (DIGIT)* ("E" SIGN? DIGIT (DIGIT)* )? +INT: SIGN? DIGIT (DIGIT)* +CONTROL_DIRECTIVE: PAGE | ALPHABET | EXTENDED2 | EXTENDED4 | ARBITRARY +PAGE : REVERSE_SOLIDUS "S" REVERSE_SOLIDUS LATIN_CODEPOINT +LATIN_CODEPOINT : SPACE | DIGIT | LOWER | UPPER | SPECIAL | REVERSE_SOLIDUS | APOSTROPHE +ALPHABET : REVERSE_SOLIDUS "P" UPPER REVERSE_SOLIDUS +EXTENDED2: REVERSE_SOLIDUS "X2" REVERSE_SOLIDUS (HEX_TWO)* END_EXTENDED +EXTENDED4 :REVERSE_SOLIDUS "X4" REVERSE_SOLIDUS (HEX_FOUR)* END_EXTENDED +END_EXTENDED: REVERSE_SOLIDUS "X0" REVERSE_SOLIDUS +ARBITRARY: REVERSE_SOLIDUS "X" REVERSE_SOLIDUS HEX_ONE +HEX_FOUR: HEX_TWO HEX_TWO +HEX_TWO: HEX_ONE HEX_ONE +HEX_ONE: HEX HEX +HEX: "0" + | "1" + | "2" + | "3" + | "4" + | "5" + | "6" + | "7" + | "8" + | "9" + | "A" + | "B" + | "C" + | "D" + | "E" + | "F" +APOSTROPHE: "'" +REVERSE_SOLIDUS: "\\" +DIGIT: "0".."9" +SIGN: "+"|"-" +LOWER: "a".."z" +UPPER: "A".."Z" +ESCAPE : "\\" ( "$" | "\"" | CHAR ) +CHAR : /[^$"\n]/ +WORD : CHAR+ +SPACE.10 : " " + +%ignore /[ \t\f\r\n]/+ +""" + +HEADER_FIELDS = { + "file_description": namedtuple('file_description', ['description', 'implementation_level']), + "file_name": namedtuple('file_name', ['name', 'time_stamp', 'author', 'organization', 'preprocessor_version', 'originating_system', 'authorization']), + "file_schema": namedtuple('file_schema', ['schema_identifiers']), +} \ No newline at end of file diff --git a/mvd_info.py b/parser/mvd_info.py similarity index 100% rename from mvd_info.py rename to parser/mvd_info.py diff --git a/parser/parse.py b/parser/parse.py new file mode 100644 index 0000000..ccb5e9c --- /dev/null +++ b/parser/parse.py @@ -0,0 +1,166 @@ +from dataclasses import dataclass +from collections import defaultdict +import re +import sys +import builtins +from lark import Lark, UnexpectedCharacters, UnexpectedToken + +# import transformer +from .transformer import Transformer, entity_instance, make_header_ent, create_step_entity +from .grammar import grammar, HEADER_FIELDS +from .errors import HeaderFieldError, DuplicateNameError, ErrorCollector, SyntaxError + +def validate_header_fields(header, error_collector, only_header = False): + for field in HEADER_FIELDS.keys(): + observed = header.get(field.upper(), []) + expected = HEADER_FIELDS.get(field)._fields + if len(observed) != len(expected): + error_collector.add(HeaderFieldError(field.upper(), len(observed), len(expected))) + if only_header: + error_collector.raise_if_any() + +@dataclass +class ParseResult: + header: dict + entities: dict[int, list[entity_instance]] + + +def process_tree(filecontent, file_tree, with_progress, error_collector): + ents = defaultdict(list) + header, data = file_tree.children + + header = dict(map(make_header_ent, header.children[0].children)) + validate_header_fields(header, error_collector) + + n = len(data.children) + if n: + percentages = [i * 100.0 / n for i in range(n + 1)] + num_dots = [int(b) - int(a) for a, b in zip(percentages, percentages[1:])] + + for idx, entity_tree in enumerate(data.children): + if with_progress: + sys.stdout.write(num_dots[idx] * ".") + sys.stdout.flush() + ent = create_step_entity(entity_tree) + id_ = int(ent["id"]) + if ents[id_]: + error_collector.add(DuplicateNameError(filecontent, ent["id"], ent["lines"])) + else: + ents[id_].append(ent) + + return header, ents + +def parse( + *, + filename=None, + filecontent=None, + with_progress=False, + with_tree=True, + only_header=False, +) -> ParseResult: + error_collector = ErrorCollector() + if filename: + assert not filecontent + filecontent = builtins.open(filename, encoding=None).read() + + # Match and remove the comments + p = r"/\*[\s\S]*?\*/" + + def replace_fn(match): + return re.sub(r"[^\n]", " ", match.group(), flags=re.M) + + filecontent_wo_comments = re.sub(p, replace_fn, filecontent) + + if only_header: + # Extract just the HEADER section using regex + header_match = re.search( + r"ISO-10303-21;\s*HEADER;(.*?)ENDSEC;", + filecontent_wo_comments, + flags=re.DOTALL | re.IGNORECASE, + ) + if not header_match: + error_collector.add(HeaderFieldError( + 'header', '', 'No HEADER section found in file' + )) + error_collector.raise_if_any() + + header_text = f"HEADER;{header_match.group(1)}ENDSEC;" + full_header_text = f"ISO-10303-21;{header_text}DATA;ENDSEC;END-ISO-10303-21;" + + parser = Lark(grammar, parser="lalr", start="file") + try: + ast = parser.parse(full_header_text) + except (UnexpectedToken, UnexpectedCharacters) as e: + error_collector.add(SyntaxError(filecontent, e)) + error_collector.raise_if_any() # Immediately abort in case of critical error + + + header_tree = ast.children[0] # HEADER section + + header = dict(map(make_header_ent, header_tree.children[0].children)) + validate_header_fields(header, error_collector, only_header=True) + error_collector.raise_if_any() + return ParseResult( + header = header, + entities = defaultdict(list) + ) + + + instance_identifiers = [] + transformer = {} + if not with_tree: + # If we're not going to return the tree, we also don't need to + # keep in memory while parsing. So we build a transformer that + # just returns None for every rule. lark creates a dictionary + # of callbacks from the transformer type object, so we can't + # simply use __getattr__ we need an actual type objects with + # callback functions for the rules given in the + + # Create a temporary parser just for analysing the grammar + temp = Lark(grammar, parser="lalr", start="file") + # Extract the rule names + rule_names = filter( + lambda s: not s.startswith("_"), set(r.origin.name for r in temp.rules) + ) + null_function = lambda self, *args: None + # Create dictionary of methods for type() creation + methods = {r: null_function for r in rule_names} + + # Even in this case we do want to report duplicate identifiers + # so these need to be captured + methods["id"] = lambda self, *args: args + methods["simple_entity_instance"] = ( + lambda self, tree: instance_identifiers.append( + (int(tree[0][0][0][1:]), int(tree[0][0][0].line)) + ) + ) + + NT = type("NullTransformer", (Transformer,), methods) + transformer = {"transformer": NT()} + + parser = Lark(grammar, parser="lalr", start="file", **transformer) + + try: + ast = parser.parse(filecontent_wo_comments) + except (UnexpectedToken, UnexpectedCharacters) as e: + error_collector.add(SyntaxError(filecontent, e)) + error_collector.raise_if_any() # Immediately abort in case of critical error + + if with_tree: + header, data = process_tree(filecontent, ast, with_progress, error_collector) + error_collector.raise_if_any() + return ParseResult( + header = header, + entities = data + ) + else: + # process_tree() would take care of duplicate identifiers, + # but we need to do it ourselves now using our rudimentary + # transformer + seen = set() + for iden, lineno in instance_identifiers: + if iden in seen: + error_collector.add(DuplicateNameError(filecontent, iden, [lineno, lineno])) + else: + seen.add(iden) + error_collector.raise_if_any() diff --git a/parser/transformer.py b/parser/transformer.py new file mode 100644 index 0000000..584c499 --- /dev/null +++ b/parser/transformer.py @@ -0,0 +1,105 @@ +from lark import Transformer +from dataclasses import dataclass +import numbers +from lark import Lark, Transformer, Tree, Token + + +class IfcType: + def __init__(self, ifctype, value): + self.ifctype = ifctype + self.value = value + + def __str__(self): + return self.ifctype + "(" + str(self.value) + ")" + + __repr__ = __str__ + +@dataclass +class entity_instance: + id: int + type: str + attributes: tuple + lines: tuple + + def __getitem__(self, k): + if isinstance(k, numbers.Integral): + return self.attributes[k] + else: + # compatibility with dict + return getattr(self, k) + + def __repr__(self): + return f'#{self.id}={self.type}({",".join(map(str, self.attributes))})' + + +class T(Transformer): + def id(self, s): + return int(s[0][1:]) + + def string(self, s): + word = "".join(s).replace("''", "'") + return word + + def keyword(self, s): + word = "".join(s) + return word + + def untyped_parameter(self, s): + return s[0] + + def parameter(self, s): + return s[0] + + def typed_parameter(self, s): + if len(s): + return IfcType(s[0], s[1]) + else: + return () + + def omitted_parameter(self, s): + return s[0] + + def enumeration(self, s): + return s[0] + + parameter_list = tuple + list = tuple + subsuper_record = list + INT = int + REAL = float + NONE = lambda *args: None + STAR = str + + +def create_step_entity(entity_tree): + t = T(visit_tokens=True).transform(entity_tree) + + def get_line_number(t): + if isinstance(t, Token): + yield t.line + + def traverse(fn, x): + yield from fn(x) + if isinstance(x, Tree): + for c in x.children: + yield from traverse(fn, c) + + lines = list(traverse(get_line_number, entity_tree)) + + entity_id = t.children[0].children[0] + entity_type = t.children[0].children[1].children[0] + + attributes_tree = t.children[0].children[1].children[1] + attributes = list(attributes_tree) + + return entity_instance( + entity_id, + entity_type, + attributes, + (min(lines), max(lines)), + ) + +def make_header_ent(ast): + rule = ast.data + params = T(visit_tokens=True).transform(ast.children[0]) + return rule.upper(), params diff --git a/test_parser.py b/test_parser.py index 48f3185..5e4ef56 100644 --- a/test_parser.py +++ b/test_parser.py @@ -1,13 +1,13 @@ import glob import pytest -from __init__ import parse, open, ValidationError +from __init__ import parse, open, _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError from contextlib import nullcontext def create_context(fn): if "fail_" in fn: - return pytest.raises(ValidationError) + return pytest.raises(_ValidationError) else: return nullcontext() @@ -20,6 +20,10 @@ def test_file_with_tree(file): @pytest.mark.parametrize("file", glob.glob("fixtures/*.ifc")) def test_file_without_tree(file): + if any(sub in file for sub in ["fail_too_many_header_entity_fields.ifc", "fail_multiple_wrong_header_fields"]): + pytest.skip("This file relies on header field validation using the parsed AST, " + "but with_tree=False uses a NullTransformer that discards the AST, " + "so validating the header field names is not possible in this mode.") with create_context(file): parse(filename=file, with_tree=False) @@ -28,9 +32,11 @@ def test_parse_features(): f = open('fixtures/pass_1.ifc') assert f.by_id(1).id == 1 assert f.by_id(1).type == 'IFCPERSON' + assert f.data_[1][0].type == 'IFCPERSON' assert f.by_type('ifcperson')[0].id == 1 assert f[1][0] is None assert f.header.file_description[0][0] == 'ViewDefinition [CoordinationView]' + assert f.header_.get('FILE_DESCRIPTION')[0][0] assert f.by_type('ifcapplication')[1][2] == "Nested ' quotes" @@ -113,9 +119,9 @@ def test_file_mvd_attr(): 'fixtures/fail_no_header.ifc', ]) def test_invalid_headers_(filename): - # error in header; with_header should raise an error - with pytest.raises(ValidationError): - parse(filename=filename, with_tree=False, only_header=True, with_header=True) + # error in header + with pytest.raises(_ValidationError): + parse(filename=filename, with_tree=False, only_header=True) @pytest.mark.parametrize("filename", [ 'fixtures/fail_duplicate_id.ifc', @@ -123,6 +129,32 @@ def test_invalid_headers_(filename): 'fixtures/fail_double_semi.ifc' ]) def test_valid_headers(filename): - # error in body; with_header should not raise an error + # error in body with nullcontext(): - parse(filename=filename, with_tree=False, only_header=True, with_header=True) + parse(filename=filename, with_tree=False, only_header=True) + +def test_header_entity_fields(): + with pytest.raises(_ValidationError): + parse(filename='fixtures/fail_too_many_header_entity_fields.ifc', only_header=True) + +def test_header_entity_fields_whole_file(): + with pytest.raises(_ValidationError): + parse(filename='fixtures/fail_too_many_header_entity_fields.ifc') + +def test_header_entity_fields_whole_file(): + with pytest.raises(CollectedValidationErrors) as exc_info: + parse(filename="fixtures/fail_multiple_duplicate_ids.ifc") + + errors = exc_info.value.errors + + assert len(errors) == 2 + assert all(isinstance(e, DuplicateNameError) for e in errors) + +def test_multiple_wrong_header_fields(): + with pytest.raises(CollectedValidationErrors) as exc_info: + parse(filename="fixtures/fail_multiple_wrong_header_fields.ifc") + + errors = exc_info.value.errors + + assert len(errors) == 2 + assert all(isinstance(e, HeaderFieldError) for e in errors) \ No newline at end of file