diff --git a/__init__.py b/__init__.py
index 02a5d1e..b79fbe1 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,545 +1,11 @@
-import builtins
-from dataclasses import dataclass
-import itertools
-import numbers
-import sys
-import re
-
-from collections import defaultdict
-import types
-
-from lark import Lark, Transformer, Tree, Token
-from lark.exceptions import UnexpectedToken, UnexpectedCharacters
 try:
-    from .mvd_info import MvdInfo, LARK_AVAILABLE
-except ImportError: # in case of running module locally (e.g. test_parser.py)
-    from mvd_info import MvdInfo, LARK_AVAILABLE
-
-class ValidationError(Exception):
-    pass
-
-from collections import namedtuple
-
-class SyntaxError(ValidationError):
-    def __init__(self, filecontent, exception):
-        self.filecontent = filecontent
-        self.exception = exception
-
-    def asdict(self, with_message=True):
-        return {
-            "type": (
-                "unexpected_token"
-                if isinstance(self.exception, UnexpectedToken)
-                else "unexpected_character"
-            ),
-            "lineno": self.exception.line,
-            "column": self.exception.column,
-            "found_type": self.exception.token.type.lower(),
-            "found_value": self.exception.token.value,
-            "expected": sorted(x for x in self.exception.accepts if "__ANON" not in x),
-            "line": self.filecontent.split("\n")[self.exception.line - 1],
-            **({"message": str(self)} if with_message else {}),
-        }
-
-    def __str__(self):
-        d = self.asdict(with_message=False)
-        if len(d["expected"]) == 1:
-            exp = d["expected"][0]
-        else:
-            exp = f"one of {' '.join(d['expected'])}"
-
-        sth = "character" if d["type"] == "unexpected_character" else ""
-
-        return f"On line {d['lineno']} column {d['column']}:\nUnexpected {sth}{d['found_type']} ('{d['found_value']}')\nExpecting {exp}\n{d['lineno']:05d} | {d['line']}\n        {' ' * (self.exception.column - 1)}^"
-
-
-class DuplicateNameError(ValidationError):
-    def __init__(self, filecontent, name, linenumbers):
-        self.name = name
-        self.filecontent = filecontent
-        self.linenumbers = linenumbers
-
-    def asdict(self, with_message=True):
-        return {
-            "type": "duplicate_name",
-            "name": self.name,
-            "lineno": self.linenumbers[0],
-            "line": self.filecontent.split("\n")[self.linenumbers[0] - 1],
-            **({"message": str(self)} if with_message else {}),
-        }
-
-    def __str__(self):
-        d = self.asdict(with_message=False)
-
-        def build():
-            yield f"On line {d['lineno']}:\nDuplicate instance name #{d['name']}"
-            yield f"{d['lineno']:05d} | {d['line']}"
-            yield " " * 8 + "^" * len(d["line"].rstrip())
-
-        return "\n".join(build())
-
-
-grammar = r"""
-file: "ISO-10303-21;" header data_section "END-ISO-10303-21;"
-header: "HEADER" ";" header_entity_list "ENDSEC" ";"
-header_line: (SPECIAL|DIGIT|LOWER|UPPER)* "*"
-data_section: "DATA" ";" (entity_instance)* "ENDSEC" ";"
-entity_instance: simple_entity_instance|complex_entity_instance 
-simple_entity_instance: id "=" simple_record ";" 
-complex_entity_instance: id "=" subsuper_record ";"
-subsuper_record : "(" simple_record_list ")" 
-simple_record_list:simple_record simple_record* 
-simple_record: keyword "("parameter_list?")"
-header_entity_list: file_description file_name file_schema
-file_description: "FILE_DESCRIPTION" "(" parameter_list ")" ";"
-file_name: "FILE_NAME" "(" parameter_list ")" ";"
-file_schema: "FILE_SCHEMA" "(" parameter_list ")" ";"
-id: /#[0-9]+/
-keyword: /[A-Z][0-9A-Z_]*/
-parameter: untyped_parameter|typed_parameter|omitted_parameter
-parameter_list: parameter ("," parameter)*
-list: "(" parameter ("," parameter)* ")" |"("")"
-typed_parameter: keyword "(" parameter ")"|"()" 
-untyped_parameter: string| NONE |INT |REAL |enumeration |id |binary |list
-omitted_parameter:STAR
-enumeration: "." keyword "."
-binary: "\"" ("0"|"1"|"2"|"3") (HEX)* "\"" 
-string: "'" (REVERSE_SOLIDUS REVERSE_SOLIDUS|SPECIAL|DIGIT|SPACE|LOWER|UPPER|CONTROL_DIRECTIVE|"\\*\\")* "'"
-
-STAR: "*"
-SLASH: "/"
-NONE: "$"
-SPECIAL : "!"  
-        | "*"
-        | "$" 
-        | "%" 
-        | "&" 
-        | "." 
-        | "#" 
-        | "+" 
-        | "," 
-        | "-" 
-        | "(" 
-        | ")" 
-        | "?" 
-        | "/" 
-        | ":" 
-        | ";" 
-        | "<" 
-        | "=" 
-        | ">" 
-        | "@" 
-        | "[" 
-        | "]" 
-        | "{" 
-        | "|" 
-        | "}" 
-        | "^" 
-        | "`" 
-        | "~"
-        | "_"
-        | "\""
-        | "\"\""
-        | "''"
-REAL: SIGN?  DIGIT  (DIGIT)* "." (DIGIT)* ("E"  SIGN?  DIGIT (DIGIT)* )?
-INT: SIGN? DIGIT  (DIGIT)* 
-CONTROL_DIRECTIVE: PAGE | ALPHABET | EXTENDED2 | EXTENDED4 | ARBITRARY 
-PAGE : REVERSE_SOLIDUS "S" REVERSE_SOLIDUS LATIN_CODEPOINT
-LATIN_CODEPOINT : SPACE | DIGIT | LOWER | UPPER | SPECIAL | REVERSE_SOLIDUS | APOSTROPHE
-ALPHABET : REVERSE_SOLIDUS "P" UPPER REVERSE_SOLIDUS 
-EXTENDED2: REVERSE_SOLIDUS "X2" REVERSE_SOLIDUS (HEX_TWO)* END_EXTENDED 
-EXTENDED4 :REVERSE_SOLIDUS "X4" REVERSE_SOLIDUS (HEX_FOUR)* END_EXTENDED 
-END_EXTENDED: REVERSE_SOLIDUS "X0" REVERSE_SOLIDUS 
-ARBITRARY: REVERSE_SOLIDUS "X" REVERSE_SOLIDUS HEX_ONE 
-HEX_FOUR: HEX_TWO HEX_TWO
-HEX_TWO: HEX_ONE HEX_ONE 
-HEX_ONE: HEX HEX
-HEX:      "0" 
-        | "1" 
-        | "2" 
-        | "3" 
-        | "4" 
-        | "5"
-        | "6" 
-        | "7" 
-        | "8" 
-        | "9" 
-        | "A" 
-        | "B" 
-        | "C" 
-        | "D" 
-        | "E" 
-        | "F" 
-APOSTROPHE: "'"
-REVERSE_SOLIDUS: "\\"
-DIGIT: "0".."9"
-SIGN: "+"|"-"
-LOWER: "a".."z"
-UPPER: "A".."Z"
-ESCAPE    : "\\" ( "$" | "\"" | CHAR )
-CHAR      : /[^$"\n]/
-WORD      : CHAR+
-SPACE.10  : " "
-
-%ignore /[ \t\f\r\n]/+
-"""
-
-
-class Ref:
-    def __init__(self, id):
-        self.id = id
-
-    def __str__(self):
-        return "#" + str(self.id)
-
-    __repr__ = __str__
-
-
-class IfcType:
-    def __init__(self, ifctype, value):
-        self.ifctype = ifctype
-        self.value = value
-
-    def __str__(self):
-        return self.ifctype + "(" + str(self.value) + ")"
-
-    __repr__ = __str__
-
-
-class T(Transformer):
-    def id(self, s):
-        return int(s[0][1:])
-
-    def string(self, s):
-        word = "".join(s).replace("''", "'")
-        return word
-
-    def keyword(self, s):
-        word = "".join(s)
-        return word
-
-    def untyped_parameter(self, s):
-        return s[0]
-
-    def parameter(self, s):
-        return s[0]
-
-    def typed_parameter(self, s):
-        if len(s):
-            return IfcType(s[0], s[1])
-        else:
-            return ()
-
-    def omitted_parameter(self, s):
-        return s[0]
-
-    def enumeration(self, s):
-        return s[0]
-
-    parameter_list = tuple
-    list = tuple
-    subsuper_record = list
-    INT = int
-    REAL = float
-    NONE = lambda *args: None
-    STAR = str
-
-
-@dataclass
-class entity_instance:
-    id: int
-    type: str
-    attributes: tuple
-    lines: tuple
-
-    def __getitem__(self, k):
-        if isinstance(k, numbers.Integral):
-            return self.attributes[k]
-        else:
-            # compatibility with dict
-            return getattr(self, k)
-
-    def __repr__(self):
-        return f'#{self.id}={self.type}({",".join(map(str, self.attributes))})'
-
-
-def create_step_entity(entity_tree):
-    t = T(visit_tokens=True).transform(entity_tree)
-
-    def get_line_number(t):
-        if isinstance(t, Token):
-            yield t.line
-
-    def traverse(fn, x):
-        yield from fn(x)
-        if isinstance(x, Tree):
-            for c in x.children:
-                yield from traverse(fn, c)
-
-    lines = list(traverse(get_line_number, entity_tree))
-
-    entity_id = t.children[0].children[0]
-    entity_type = t.children[0].children[1].children[0]
-
-    attributes_tree = t.children[0].children[1].children[1]
-    attributes = list(attributes_tree)
-
-    return entity_instance(
-        entity_id,
-        entity_type,
-        attributes,
-        (min(lines), max(lines)),
-    )
-    
-def make_header_ent(ast):
-    rule = ast.data
-    params = T(visit_tokens=True).transform(ast.children[0])
-    return rule.upper(), params
-
-
-
-def process_tree(filecontent, file_tree, with_progress, with_header=False):
-    ents = defaultdict(list)
-    header, data = file_tree.children
-
-    if with_header:
-        header = dict(map(make_header_ent, header.children[0].children))
-
-    n = len(data.children)
-    if n:
-        percentages = [i * 100.0 / n for i in range(n + 1)]
-        num_dots = [int(b) - int(a) for a, b in zip(percentages, percentages[1:])]
-
-    for idx, entity_tree in enumerate(data.children):
-        if with_progress:
-            sys.stdout.write(num_dots[idx] * ".")
-            sys.stdout.flush()
-        ent = create_step_entity(entity_tree)
-        id_ = int(ent["id"])
-        if ents[id_]:
-            raise DuplicateNameError(filecontent, ent["id"], ent["lines"])
-        ents[id_].append(ent)
-
-    if with_header:
-        return header, ents
-    else:
-        return ents
-
-
-def parse(
-    *,
-    filename=None,
-    filecontent=None,
-    with_progress=False,
-    with_tree=True,
-    with_header=False,
-    only_header=False,
-):
-    if filename:
-        assert not filecontent
-        filecontent = builtins.open(filename, encoding=None).read()
-        
-    if only_header:
-        with_header = True
-
-    # Match and remove the comments
-    p = r"/\*[\s\S]*?\*/"
-
-    def replace_fn(match):
-        return re.sub(r"[^\n]", " ", match.group(), flags=re.M)
-
-    filecontent_wo_comments = re.sub(p, replace_fn, filecontent)
-    
-        
-    if only_header:
-        # Extract just the HEADER section using regex
-        header_match = re.search(
-            r"ISO-10303-21;\s*HEADER;(.*?)ENDSEC;",
-            filecontent_wo_comments,
-            flags=re.DOTALL | re.IGNORECASE,
-        )
-        if not header_match:
-            raise ValidationError("No HEADER section found in file")
-
-        header_text = f"HEADER;{header_match.group(1)}ENDSEC;"
-        full_header_text = f"ISO-10303-21;{header_text}DATA;ENDSEC;END-ISO-10303-21;"
-
-        parser = Lark(grammar, parser="lalr", start="file")
-        try:
-            ast = parser.parse(full_header_text)
-        except (UnexpectedToken, UnexpectedCharacters) as e:
-            raise SyntaxError(filecontent, e)
-
-        header_tree = ast.children[0]  # HEADER section
-
-        header = dict(map(make_header_ent, header_tree.children[0].children))
-        return header
-    
-
-    instance_identifiers = []
-    transformer = {}
-    if not with_tree:
-        # If we're not going to return the tree, we also don't need to
-        # keep in memory while parsing. So we build a transformer that
-        # just returns None for every rule. lark creates a dictionary
-        # of callbacks from the transformer type object, so we can't
-        # simply use __getattr__ we need an actual type objects with
-        # callback functions for the rules given in the grammar.
-
-        # Create a temporary parser just for analysing the grammar
-        temp = Lark(grammar, parser="lalr", start="file")
-        # Extract the rule names
-        rule_names = filter(
-            lambda s: not s.startswith("_"), set(r.origin.name for r in temp.rules)
-        )
-        null_function = lambda self, *args: None
-        # Create dictionary of methods for type() creation
-        methods = {r: null_function for r in rule_names}
-
-        # Even in this case we do want to report duplicate identifiers
-        # so these need to be captured
-        methods["id"] = lambda self, *args: args
-        methods["simple_entity_instance"] = (
-            lambda self, tree: instance_identifiers.append(
-                (int(tree[0][0][0][1:]), int(tree[0][0][0].line))
-            )
-        )
-
-        NT = type("NullTransformer", (Transformer,), methods)
-        transformer = {"transformer": NT()}
-
-    parser = Lark(grammar, parser="lalr", start="file", **transformer)
-
-    try:
-        ast = parser.parse(filecontent_wo_comments)
-    except (UnexpectedToken, UnexpectedCharacters) as e:
-        raise SyntaxError(filecontent, e)
-
-    if with_tree:
-        return process_tree(filecontent, ast, with_progress, with_header)
-    else:
-        # process_tree() would take care of duplicate identifiers,
-        # but we need to do it ourselves now using our rudimentary
-        # transformer
-        seen = set()
-        for iden, lineno in instance_identifiers:
-            if iden in seen:
-                raise DuplicateNameError(filecontent, iden, [lineno, lineno])
-            seen.add(iden)
-
-
-class file:
-    """
-    A somewhat compatible interface (but very limited) to ifcopenshell.file
-    """
-
-    def __init__(self, parse_outcomes):
-        self.header_, self.data_ = parse_outcomes
-
-    @property
-    def schema_identifier(self) -> str:
-        return self.header_["FILE_SCHEMA"][0][0]
-
-    @property
-    def schema(self) -> str:
-        """General IFC schema version: IFC2X3, IFC4, IFC4X3."""
-        prefixes = ("IFC", "X", "_ADD", "_TC")
-        reg = "".join(f"(?P<{s}>{s}\\d+)?" for s in prefixes)
-        match = re.match(reg, self.schema_identifier)
-        version_tuple = tuple(
-            map(
-                lambda pp: int(pp[1][len(pp[0]) :]) if pp[1] else None,
-                ((p, match.group(p)) for p in prefixes),
-            )
-        )
-        return "".join(
-            "".join(map(str, t)) if t[1] else ""
-            for t in zip(prefixes, version_tuple[0:2])
-        )
-
-    @property
-    def schema_version(self) -> tuple[int, int, int, int]:
-        """Numeric representation of the full IFC schema version.
-
-        E.g. IFC4X3_ADD2 is represented as (4, 3, 2, 0).
-        """
-        schema = self.wrapped_data.schema
-        version = []
-        for prefix in ("IFC", "X", "_ADD", "_TC"):
-            number = re.search(prefix + r"(\d)", schema)
-            version.append(int(number.group(1)) if number else 0)
-        return tuple(version)
-
-
-    @property
-    def header(self):
-        HEADER_FIELDS = {
-            "file_description": namedtuple('file_description', ['description', 'implementation_level']),
-            "file_name": namedtuple('file_name', ['name', 'time_stamp', 'author', 'organization', 'preprocessor_version', 'originating_system', 'authorization']),
-            "file_schema":  namedtuple('file_schema', ['schema_identifiers']),
-        }
-        header = {}
-
-        for field_name, namedtuple_class in HEADER_FIELDS.items():
-            field_data = self.header_.get(field_name.upper(), [])
-            header[field_name.lower()] = namedtuple_class(*field_data)
-
-        return types.SimpleNamespace(**header)
-    
-    
-    @property
-    def mvd(self):
-        if not LARK_AVAILABLE or MvdInfo is None:
-            return None
-        return MvdInfo(self.header)
-
-    def __getitem__(self, key: numbers.Integral) -> entity_instance:
-        return self.by_id(key)
-
-    def by_id(self, id: int) -> entity_instance:
-        """Return an IFC entity instance filtered by IFC ID.
-
-        :param id: STEP numerical identifier
-        :type id: int
-
-        :raises RuntimeError: If `id` is not found or multiple definitions exist for `id`.
-
-        :rtype: entity_instance
-        """
-        ns = self.data_.get(id, [])
-        if len(ns) == 0:
-            raise RuntimeError(f"Instance with id {id} not found")
-        elif len(ns) > 1:
-            raise RuntimeError(f"Duplicate definition for id {id}")
-        return ns[0]
-
-    def by_type(self, type: str) -> list[entity_instance]:
-        """Return IFC objects filtered by IFC Type and wrapped with the entity_instance class.
-        :rtype: list[entity_instance]
-        """
-        type_lc = type.lower()
-        return list(
-            filter(
-                lambda ent: ent.type.lower() == type_lc,
-                itertools.chain.from_iterable(self.data_.values()),
-            )
-        )
-
-
-def open(fn, only_header: bool = False) -> file:
-    if only_header: # Ensure consistent options
-        parse_outcomes = parse(
-            filename=fn,
-            with_tree=True,
-            with_header=True,  # must be True to return the header
-            only_header=True,
-        )
-        return file((parse_outcomes, defaultdict(list)))  # data section is empty
-    else:
-        parse_outcomes = parse(
-            filename=fn,
-            with_tree=True,
-            with_header=True,
-            only_header=False,
-        )
-        return file(parse_outcomes)
\ No newline at end of file
+    from parser.parse import parse
+    from parser.file import file, open
+    from parser.errors import _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError
+except:
+    from .parser.parse import parse 
+    from .parser.file import file, open 
+    from .parser.errors import _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError
+
+__all__ = ["parse", "open", "file", "_ValidationError", 
+           "CollectedValidationErrors", "DuplicateNameError", "HeaderFieldError"] # for testing 
\ No newline at end of file
diff --git a/__main__.py b/__main__.py
index 7230888..537d022 100644
--- a/__main__.py
+++ b/__main__.py
@@ -1,7 +1,7 @@
 import sys
 import json
 import argparse
-from . import parse, ValidationError
+from . import parse, CollectedValidationErrors
 
 def main():
     parser = argparse.ArgumentParser(description="Parse and validate STEP file.")
@@ -22,11 +22,11 @@ def main():
         if not args.json:
             print("Valid", file=sys.stderr)
         exit(0)
-    except ValidationError as exc:
+    except CollectedValidationErrors as exc:
         if not args.json:
             print(exc, file=sys.stderr)
         else:
-            json.dump(exc.asdict(), sys.stdout)
+            json.dump([e.asdict() for e in exc.errors], sys.stdout, indent=2)
         exit(1)
 
 if __name__ == '__main__':
diff --git a/fixtures/fail_multiple_duplicate_ids.ifc b/fixtures/fail_multiple_duplicate_ids.ifc
new file mode 100644
index 0000000..1df3800
--- /dev/null
+++ b/fixtures/fail_multiple_duplicate_ids.ifc
@@ -0,0 +1,29 @@
+ISO-10303-21;
+HEADER;
+FILE_DESCRIPTION(('ViewDefinition [CoordinationView]'),'2;1');
+FILE_NAME('','2022-05-04T08:08:30',(''),(''),'IfcOpenShell-0.7.0','IfcOpenShell-0.7.0','');
+FILE_SCHEMA(('IFC4'));
+ENDSEC;
+DATA;
+#1=IFCPERSON($,$,'',$,$,$,$,$);
+#2=IFCORGANIZATION($,'',$,$,$);
+#3=IFCPERSONANDORGANIZATION(#1,#2,$);
+#4=IFCAPPLICATION(#2,'0.7.0','IfcOpenShell-0.7.0','');
+#5=IFCOWNERHISTORY(#3,#4,$,.ADDED.,$,#3,#4,1651651710);
+#6=IFCDIRECTION((1.,0.,0.));
+#7=IFCDIRECTION((0.,0.,1.));
+#8=IFCCARTESIANPOINT((0.,0.,0.));
+#9=IFCAXIS2PLACEMENT3D(#8,#7,#6);
+#10=IFCDIRECTION((0.,1.,0.));
+#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10);
+#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0);
+#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.);
+#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.);
+#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.);
+#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.);
+#18=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16);
+#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17);
+#19=IFCUNITASSIGNMENT((#13,#14,#15,#18));
+#19=IFCPROJECT('2AyG2X0sb16Bjd4gQc07yZ',#5,'',$,$,$,$,(#11),#19);
+ENDSEC;
+END-ISO-10303-21;
diff --git a/fixtures/fail_multiple_wrong_header_fields.ifc b/fixtures/fail_multiple_wrong_header_fields.ifc
new file mode 100644
index 0000000..1ca0687
--- /dev/null
+++ b/fixtures/fail_multiple_wrong_header_fields.ifc
@@ -0,0 +1,30 @@
+ISO-10303-21;
+HEADER;
+FILE_DESCRIPTION(('ViewDefinition [ReferenceView_V1.2]', 'ExchangeRequirement [Any]'));
+FILE_NAME('Header.ifc','2025-02-13T15:58:45',('tricott'),('Trimble Inc.'),'TrimBimToIFC rel. 4.0.2','Example - Example - 2025.0','IFC4 model', '');
+FILE_SCHEMA(('IFC4'));
+ENDSEC;
+DATA;
+#1=IFCPERSON($,$,'',$,$,$,$,$);
+#2=IFCORGANIZATION($,'',$,$,$);
+#3=IFCPERSONANDORGANIZATION(#1,#2,$);
+#4=IFCAPPLICATION(#2,'v0.7.0-6c9e130ca','IfcOpenShell-v0.7.0-6c9e130ca','');
+#5=IFCOWNERHISTORY(#3,#4,$,.NOTDEFINED.,$,#3,#4,1700419055);
+#6=IFCDIRECTION((1.,0.,0.));
+#7=IFCDIRECTION((0.,0.,1.));
+#8=IFCCARTESIANPOINT((0.,0.,0.));
+#9=IFCAXIS2PLACEMENT3D(#8,#7,#6);
+#10=IFCDIRECTION((0.,1.));
+#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10);
+#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0);
+#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.);
+#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.);
+#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.);
+#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.);
+#17=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16);
+#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17);
+#19=IFCUNITASSIGNMENT((#13,#14,#15,#18));
+#20=IFCPROJECT('0iDmeiiLP3AOllitM2Favn',#5,'',$,$,$,$,(#11),#19);
+#21=IFCSITE('3rg2jGkIH10RFhrQsGZKRk',#5,$,$,$,$,$,$,$,$,$,$,$,$);
+ENDSEC;
+END-ISO-10303-21;
diff --git a/fixtures/fail_too_many_header_entity_fields.ifc b/fixtures/fail_too_many_header_entity_fields.ifc
new file mode 100644
index 0000000..24a853b
--- /dev/null
+++ b/fixtures/fail_too_many_header_entity_fields.ifc
@@ -0,0 +1,30 @@
+ISO-10303-21;
+HEADER;
+FILE_DESCRIPTION(('ViewDefinition [ReferenceView_V1.2]', 'ExchangeRequirement [Any]'),'2;1');
+FILE_NAME('Header.ifc','2025-02-13T15:58:45',('tricott'),('Trimble Inc.'),'TrimBimToIFC rel. 4.0.2','Example - Example - 2025.0','IFC4 model', '');
+FILE_SCHEMA(('IFC4'));
+ENDSEC;
+DATA;
+#1=IFCPERSON($,$,'',$,$,$,$,$);
+#2=IFCORGANIZATION($,'',$,$,$);
+#3=IFCPERSONANDORGANIZATION(#1,#2,$);
+#4=IFCAPPLICATION(#2,'v0.7.0-6c9e130ca','IfcOpenShell-v0.7.0-6c9e130ca','');
+#5=IFCOWNERHISTORY(#3,#4,$,.NOTDEFINED.,$,#3,#4,1700419055);
+#6=IFCDIRECTION((1.,0.,0.));
+#7=IFCDIRECTION((0.,0.,1.));
+#8=IFCCARTESIANPOINT((0.,0.,0.));
+#9=IFCAXIS2PLACEMENT3D(#8,#7,#6);
+#10=IFCDIRECTION((0.,1.));
+#11=IFCGEOMETRICREPRESENTATIONCONTEXT($,'Model',3,1.E-05,#9,#10);
+#12=IFCDIMENSIONALEXPONENTS(0,0,0,0,0,0,0);
+#13=IFCSIUNIT(*,.LENGTHUNIT.,$,.METRE.);
+#14=IFCSIUNIT(*,.AREAUNIT.,$,.SQUARE_METRE.);
+#15=IFCSIUNIT(*,.VOLUMEUNIT.,$,.CUBIC_METRE.);
+#16=IFCSIUNIT(*,.PLANEANGLEUNIT.,$,.RADIAN.);
+#17=IFCMEASUREWITHUNIT(IFCPLANEANGLEMEASURE(0.017453292519943295),#16);
+#18=IFCCONVERSIONBASEDUNIT(#12,.PLANEANGLEUNIT.,'DEGREE',#17);
+#19=IFCUNITASSIGNMENT((#13,#14,#15,#18));
+#20=IFCPROJECT('0iDmeiiLP3AOllitM2Favn',#5,'',$,$,$,$,(#11),#19);
+#21=IFCSITE('3rg2jGkIH10RFhrQsGZKRk',#5,$,$,$,$,$,$,$,$,$,$,$,$);
+ENDSEC;
+END-ISO-10303-21;
diff --git a/parser/__init__.py b/parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parser/errors.py b/parser/errors.py
new file mode 100644
index 0000000..2449866
--- /dev/null
+++ b/parser/errors.py
@@ -0,0 +1,108 @@
+from lark.exceptions import UnexpectedToken
+
+class _ValidationError(Exception):
+    def __init__(self, *args, **kwargs):
+        if self.__class__ is _ValidationError:
+            raise TypeError("Do not raise _ValidationError directly.")
+        super().__init__(*args, **kwargs)
+
+class ErrorCollector:
+    def __init__(self):
+        self.errors = []
+
+    def add(self, error):
+        self.errors.append(error)
+
+    def raise_if_any(self):
+        if self.errors:
+            raise CollectedValidationErrors(self.errors)
+
+class CollectedValidationErrors(_ValidationError):
+    def __init__(self, errors):
+        self.errors = errors
+        
+    def asdict(self, with_message=True):
+        return [e.asdict(with_message=with_message) for e in self.errors]
+
+    def __str__(self):
+        return f"{len(self.errors)} validation error(s) collected:\n" + "\n\n".join(str(e) for e in self.errors)
+
+class SyntaxError(_ValidationError):
+    def __init__(self, filecontent, exception):
+        self.filecontent = filecontent
+        self.exception = exception
+
+    def asdict(self, with_message=True):
+        return {
+            "type": (
+                "unexpected_token"
+                if isinstance(self.exception, UnexpectedToken)
+                else "unexpected_character"
+            ),
+            "lineno": self.exception.line,
+            "column": self.exception.column,
+            "found_type": self.exception.token.type.lower(),
+            "found_value": self.exception.token.value,
+            "expected": sorted(x for x in self.exception.accepts if "__ANON" not in x),
+            "line": self.filecontent.split("\n")[self.exception.line - 1],
+            **({"message": str(self)} if with_message else {}),
+        }
+
+    def __str__(self):
+        d = self.asdict(with_message=False)
+        if len(d["expected"]) == 1:
+            exp = d["expected"][0]
+        else:
+            exp = f"one of {' '.join(d['expected'])}"
+
+        sth = "character" if d["type"] == "unexpected_character" else ""
+
+        return f"On line {d['lineno']} column {d['column']}:\nUnexpected {sth}{d['found_type']} ('{d['found_value']}')\nExpecting {exp}\n{d['lineno']:05d} | {d['line']}\n        {' ' * (self.exception.column - 1)}^"
+
+
+class DuplicateNameError(_ValidationError):
+    def __init__(self, filecontent, name, linenumbers):
+        self.name = name
+        self.filecontent = filecontent
+        self.linenumbers = linenumbers
+
+    def asdict(self, with_message=True):
+        return {
+            "type": "duplicate_name",
+            "name": self.name,
+            "lineno": self.linenumbers[0],
+            "line": self.filecontent.split("\n")[self.linenumbers[0] - 1],
+            **({"message": str(self)} if with_message else {}),
+        }
+
+    def __str__(self):
+        d = self.asdict(with_message=False)
+
+        def build():
+            yield f"On line {d['lineno']}:\nDuplicate instance name #{d['name']}"
+            yield f"{d['lineno']:05d} | {d['line']}"
+            yield " " * 8 + "^" * len(d["line"].rstrip())
+
+        return "\n".join(build())
+    
+    
+class HeaderFieldError(_ValidationError):
+    def __init__(self, field, found_len, expected_len):
+        self.field = field
+        self.found_len = found_len
+        self.expected_len = expected_len
+
+    def asdict(self, with_message=True):
+        return {
+            "type": "invalid_header_field",
+            "field": self.field,
+            "expected_field_count": self.expected_len,
+            "actual_field_count": self.found_len,
+            **({"message": str(self)} if with_message else {}),
+        }
+
+    def __str__(self):
+        return (
+            f"Invalid number of parameters for HEADER field '{self.field}'. "
+            f"Expected {self.expected_len}, found {self.found_len}."
+        )
\ No newline at end of file
diff --git a/parser/file.py b/parser/file.py
new file mode 100644
index 0000000..908d6cf
--- /dev/null
+++ b/parser/file.py
@@ -0,0 +1,108 @@
+import types
+import re 
+import numbers
+import itertools
+
+from .parse import parse, ParseResult
+from .grammar import HEADER_FIELDS
+from .transformer import entity_instance
+
+try:
+    from .mvd_info import MvdInfo, LARK_AVAILABLE
+except ImportError: # in case of running module locally (e.g. test_parser.py)
+    from mvd_info import MvdInfo, LARK_AVAILABLE
+
+class file:
+    """
+    A somewhat compatible interface (but very limited) to ifcopenshell.file
+    """
+
+    def __init__(self, result:ParseResult):
+        self.header_ = result.header
+        self.data_ = result.entities
+
+    @property
+    def schema_identifier(self) -> str:
+        return self.header_["FILE_SCHEMA"][0][0]
+
+    @property
+    def schema(self) -> str:
+        """General IFC schema version: IFC2X3, IFC4, IFC4X3."""
+        prefixes = ("IFC", "X", "_ADD", "_TC")
+        reg = "".join(f"(?P<{s}>{s}\\d+)?" for s in prefixes)
+        match = re.match(reg, self.schema_identifier)
+        version_tuple = tuple(
+            map(
+                lambda pp: int(pp[1][len(pp[0]) :]) if pp[1] else None,
+                ((p, match.group(p)) for p in prefixes),
+            )
+        )
+        return "".join(
+            "".join(map(str, t)) if t[1] else ""
+            for t in zip(prefixes, version_tuple[0:2])
+        )
+
+    @property
+    def schema_version(self) -> tuple[int, int, int, int]:
+        """Numeric representation of the full IFC schema version.
+
+        E.g. IFC4X3_ADD2 is represented as (4, 3, 2, 0).
+        """
+        schema = self.wrapped_data.schema
+        version = []
+        for prefix in ("IFC", "X", "_ADD", "_TC"):
+            number = re.search(prefix + r"(\d)", schema)
+            version.append(int(number.group(1)) if number else 0)
+        return tuple(version)
+
+
+    @property
+    def header(self):
+        header = {}
+        for field_name, namedtuple_class in HEADER_FIELDS.items():
+            field_data = self.header_.get(field_name.upper(), [])
+            header[field_name.lower()] = namedtuple_class(*field_data)
+
+        return types.SimpleNamespace(**header)
+    
+    
+    @property
+    def mvd(self):
+        if not LARK_AVAILABLE or MvdInfo is None:
+            return None
+        return MvdInfo(self.header)
+
+    def __getitem__(self, key: numbers.Integral) -> entity_instance:
+        return self.by_id(key)
+
+    def by_id(self, id: int) -> entity_instance:
+        """Return an IFC entity instance filtered by IFC ID.
+
+        :param id: STEP numerical identifier
+        :type id: int
+
+        :raises RuntimeError: If `id` is not found or multiple definitions exist for `id`.
+
+        :rtype: entity_instance
+        """
+        ns = self.data_.get(id, [])
+        if len(ns) == 0:
+            raise RuntimeError(f"Instance with id {id} not found")
+        elif len(ns) > 1:
+            raise RuntimeError(f"Duplicate definition for id {id}")
+        return ns[0]
+
+    def by_type(self, type: str) -> list[entity_instance]:
+        """Return IFC objects filtered by IFC Type and wrapped with the entity_instance class.
+        :rtype: list[entity_instance]
+        """
+        type_lc = type.lower()
+        return list(
+            filter(
+                lambda ent: ent.type.lower() == type_lc,
+                itertools.chain.from_iterable(self.data_.values()),
+            )
+        )
+
+def open(fn, only_header= False) -> file:
+    return file(parse(filename=fn, only_header=only_header))
\ No newline at end of file
diff --git a/parser/grammar.py b/parser/grammar.py
new file mode 100644
index 0000000..9d6a016
--- /dev/null
+++ b/parser/grammar.py
@@ -0,0 +1,112 @@
+from collections import namedtuple
+
+grammar = r"""
+file: "ISO-10303-21;" header data_section "END-ISO-10303-21;"
+header: "HEADER" ";" header_entity_list "ENDSEC" ";"
+header_line: (SPECIAL|DIGIT|LOWER|UPPER)* "*"
+data_section: "DATA" ";" (entity_instance)* "ENDSEC" ";"
+entity_instance: simple_entity_instance|complex_entity_instance 
+simple_entity_instance: id "=" simple_record ";" 
+complex_entity_instance: id "=" subsuper_record ";"
+subsuper_record : "(" simple_record_list ")" 
+simple_record_list:simple_record simple_record* 
+simple_record: keyword "("parameter_list?")"
+header_entity_list: file_description file_name file_schema
+file_description: "FILE_DESCRIPTION" "(" parameter_list ")" ";"
+file_name: "FILE_NAME" "(" parameter_list ")" ";"
+file_schema: "FILE_SCHEMA" "(" parameter_list ")" ";"
+id: /#[0-9]+/
+keyword: /[A-Z][0-9A-Z_]*/
+parameter: untyped_parameter|typed_parameter|omitted_parameter
+parameter_list: parameter ("," parameter)*
+list: "(" parameter ("," parameter)* ")" |"("")"
+typed_parameter: keyword "(" parameter ")"|"()" 
+untyped_parameter: string| NONE |INT |REAL |enumeration |id |binary |list
+omitted_parameter:STAR
+enumeration: "." keyword "."
+binary: "\"" ("0"|"1"|"2"|"3") (HEX)* "\"" 
+string: "'" (REVERSE_SOLIDUS REVERSE_SOLIDUS|SPECIAL|DIGIT|SPACE|LOWER|UPPER|CONTROL_DIRECTIVE|"\\*\\")* "'"
+
+STAR: "*"
+SLASH: "/"
+NONE: "$"
+SPECIAL : "!"  
+        | "*"
+        | "$" 
+        | "%" 
+        | "&" 
+        | "." 
+        | "#" 
+        | "+" 
+        | "," 
+        | "-" 
+        | "(" 
+        | ")" 
+        | "?" 
+        | "/" 
+        | ":" 
+        | ";" 
+        | "<" 
+        | "=" 
+        | ">" 
+        | "@" 
+        | "[" 
+        | "]" 
+        | "{" 
+        | "|" 
+        | "}" 
+        | "^" 
+        | "`" 
+        | "~"
+        | "_"
+        | "\""
+        | "\"\""
+        | "''"
+REAL: SIGN?  DIGIT  (DIGIT)* "." (DIGIT)* ("E"  SIGN?  DIGIT (DIGIT)* )?
+INT: SIGN? DIGIT  (DIGIT)* 
+CONTROL_DIRECTIVE: PAGE | ALPHABET | EXTENDED2 | EXTENDED4 | ARBITRARY 
+PAGE : REVERSE_SOLIDUS "S" REVERSE_SOLIDUS LATIN_CODEPOINT
+LATIN_CODEPOINT : SPACE | DIGIT | LOWER | UPPER | SPECIAL | REVERSE_SOLIDUS | APOSTROPHE
+ALPHABET : REVERSE_SOLIDUS "P" UPPER REVERSE_SOLIDUS 
+EXTENDED2: REVERSE_SOLIDUS "X2" REVERSE_SOLIDUS (HEX_TWO)* END_EXTENDED 
+EXTENDED4 :REVERSE_SOLIDUS "X4" REVERSE_SOLIDUS (HEX_FOUR)* END_EXTENDED 
+END_EXTENDED: REVERSE_SOLIDUS "X0" REVERSE_SOLIDUS 
+ARBITRARY: REVERSE_SOLIDUS "X" REVERSE_SOLIDUS HEX_ONE 
+HEX_FOUR: HEX_TWO HEX_TWO
+HEX_TWO: HEX_ONE HEX_ONE 
+HEX_ONE: HEX HEX
+HEX:      "0" 
+        | "1" 
+        | "2" 
+        | "3" 
+        | "4" 
+        | "5"
+        | "6" 
+        | "7" 
+        | "8" 
+        | "9" 
+        | "A" 
+        | "B" 
+        | "C" 
+        | "D" 
+        | "E" 
+        | "F" 
+APOSTROPHE: "'"
+REVERSE_SOLIDUS: "\\"
+DIGIT: "0".."9"
+SIGN: "+"|"-"
+LOWER: "a".."z"
+UPPER: "A".."Z"
+ESCAPE    : "\\" ( "$" | "\"" | CHAR )
+CHAR      : /[^$"\n]/
+WORD      : CHAR+
+SPACE.10  : " "
+
+%ignore /[ \t\f\r\n]/+
+"""
+
+HEADER_FIELDS = {
+    "file_description": namedtuple('file_description', ['description', 'implementation_level']),
+    "file_name": namedtuple('file_name', ['name', 'time_stamp', 'author', 'organization', 'preprocessor_version', 'originating_system', 'authorization']),
+    "file_schema":  namedtuple('file_schema', ['schema_identifiers']),
+}
\ No newline at end of file
diff --git a/mvd_info.py b/parser/mvd_info.py
similarity index 100%
rename from mvd_info.py
rename to parser/mvd_info.py
diff --git a/parser/parse.py b/parser/parse.py
new file mode 100644
index 0000000..ccb5e9c
--- /dev/null
+++ b/parser/parse.py
@@ -0,0 +1,166 @@
+from dataclasses import dataclass
+from collections import defaultdict
+import re 
+import sys
+import builtins
+from lark import Lark, UnexpectedCharacters, UnexpectedToken
+
+# import transformer
+from .transformer import Transformer, entity_instance, make_header_ent, create_step_entity
+from .grammar import grammar, HEADER_FIELDS
+from .errors import HeaderFieldError, DuplicateNameError, ErrorCollector, SyntaxError
+
+def validate_header_fields(header, error_collector, only_header = False):
+    for field in HEADER_FIELDS.keys():
+        observed = header.get(field.upper(), [])
+        expected = HEADER_FIELDS.get(field)._fields
+        if len(observed) != len(expected):
+            error_collector.add(HeaderFieldError(field.upper(), len(observed), len(expected)))
+            if only_header:
+                error_collector.raise_if_any()
+
+@dataclass
+class ParseResult:
+    header: dict
+    entities: dict[int, list[entity_instance]]
+    
+
+def process_tree(filecontent, file_tree, with_progress, error_collector):
+    ents = defaultdict(list)
+    header, data = file_tree.children
+
+    header = dict(map(make_header_ent, header.children[0].children))
+    validate_header_fields(header, error_collector)
+
+    n = len(data.children)
+    if n:
+        percentages = [i * 100.0 / n for i in range(n + 1)]
+        num_dots = [int(b) - int(a) for a, b in zip(percentages, percentages[1:])]
+
+    for idx, entity_tree in enumerate(data.children):
+        if with_progress:
+            sys.stdout.write(num_dots[idx] * ".")
+            sys.stdout.flush()
+        ent = create_step_entity(entity_tree)
+        id_ = int(ent["id"])
+        if ents[id_]:
+            error_collector.add(DuplicateNameError(filecontent, ent["id"], ent["lines"]))
+        else:
+            ents[id_].append(ent)
+
+    return header, ents
+
+def parse(
+    *,
+    filename=None,
+    filecontent=None,
+    with_progress=False,
+    with_tree=True,
+    only_header=False,
+) -> ParseResult:
+    error_collector = ErrorCollector()
+    if filename:
+        assert not filecontent
+        filecontent = builtins.open(filename, encoding=None).read()
+
+    # Match and remove the comments
+    p = r"/\*[\s\S]*?\*/"
+
+    def replace_fn(match):
+        return re.sub(r"[^\n]", " ", match.group(), flags=re.M)
+
+    filecontent_wo_comments = re.sub(p, replace_fn, filecontent)
+
+    if only_header:
+        # Extract just the HEADER section using regex
+        header_match = re.search(
+            r"ISO-10303-21;\s*HEADER;(.*?)ENDSEC;",
+            filecontent_wo_comments,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        if not header_match:
+            error_collector.add(HeaderFieldError(
+                'header', '', 'No HEADER section found in file'
+            ))
+            error_collector.raise_if_any()
+
+        header_text = f"HEADER;{header_match.group(1)}ENDSEC;"
+        full_header_text = f"ISO-10303-21;{header_text}DATA;ENDSEC;END-ISO-10303-21;"
+
+        parser = Lark(grammar, parser="lalr", start="file")
+        try:
+            ast = parser.parse(full_header_text)
+        except (UnexpectedToken, UnexpectedCharacters) as e:
+            error_collector.add(SyntaxError(filecontent, e))
+            error_collector.raise_if_any()  # Immediately abort in case of critical error
+
+
+        header_tree = ast.children[0]  # HEADER section
+
+        header = dict(map(make_header_ent, header_tree.children[0].children))
+        validate_header_fields(header, error_collector, only_header=True)
+        error_collector.raise_if_any() 
+        return ParseResult(
+            header = header,
+            entities = defaultdict(list)
+        )
+    
+
+    instance_identifiers = []
+    transformer = {}
+    if not with_tree:
+        # If we're not going to return the tree, we also don't need to
+        # keep in memory while parsing. So we build a transformer that
+        # just returns None for every rule. lark creates a dictionary
+        # of callbacks from the transformer type object, so we can't
+        # simply use __getattr__ we need an actual type objects with
+        # callback functions for the rules given in the 
+
+        # Create a temporary parser just for analysing the grammar
+        temp = Lark(grammar, parser="lalr", start="file")
+        # Extract the rule names
+        rule_names = filter(
+            lambda s: not s.startswith("_"), set(r.origin.name for r in temp.rules)
+        )
+        null_function = lambda self, *args: None
+        # Create dictionary of methods for type() creation
+        methods = {r: null_function for r in rule_names}
+
+        # Even in this case we do want to report duplicate identifiers
+        # so these need to be captured
+        methods["id"] = lambda self, *args: args
+        methods["simple_entity_instance"] = (
+            lambda self, tree: instance_identifiers.append(
+                (int(tree[0][0][0][1:]), int(tree[0][0][0].line))
+            )
+        )
+
+        NT = type("NullTransformer", (Transformer,), methods)
+        transformer = {"transformer": NT()}
+
+    parser = Lark(grammar, parser="lalr", start="file", **transformer)
+    
+    try:
+        ast = parser.parse(filecontent_wo_comments)
+    except (UnexpectedToken, UnexpectedCharacters) as e:
+        error_collector.add(SyntaxError(filecontent, e))
+        error_collector.raise_if_any()  # Immediately abort in case of critical error
+
+    if with_tree:
+        header, data = process_tree(filecontent, ast, with_progress, error_collector)
+        error_collector.raise_if_any() 
+        return ParseResult(
+            header = header, 
+            entities = data
+        )
+    else:
+        # process_tree() would take care of duplicate identifiers,
+        # but we need to do it ourselves now using our rudimentary
+        # transformer
+        seen = set()
+        for iden, lineno in instance_identifiers:
+            if iden in seen:
+                error_collector.add(DuplicateNameError(filecontent, iden, [lineno, lineno]))
+            else:
+                seen.add(iden)
+        error_collector.raise_if_any()
diff --git a/parser/transformer.py b/parser/transformer.py
new file mode 100644
index 0000000..584c499
--- /dev/null
+++ b/parser/transformer.py
@@ -0,0 +1,105 @@
+from lark import Transformer
+from dataclasses import dataclass
+import numbers
+from lark import Lark, Transformer, Tree, Token
+
+
+class IfcType:
+    def __init__(self, ifctype, value):
+        self.ifctype = ifctype
+        self.value = value
+
+    def __str__(self):
+        return self.ifctype + "(" + str(self.value) + ")"
+
+    __repr__ = __str__
+
+@dataclass
+class entity_instance:
+    id: int
+    type: str
+    attributes: tuple
+    lines: tuple
+
+    def __getitem__(self, k):
+        if isinstance(k, numbers.Integral):
+            return self.attributes[k]
+        else:
+            # compatibility with dict
+            return getattr(self, k)
+
+    def __repr__(self):
+        return f'#{self.id}={self.type}({",".join(map(str, self.attributes))})'
+
+
+class T(Transformer):
+    def id(self, s):
+        return int(s[0][1:])
+
+    def string(self, s):
+        word = "".join(s).replace("''", "'")
+        return word
+
+    def keyword(self, s):
+        word = "".join(s)
+        return word
+
+    def untyped_parameter(self, s):
+        return s[0]
+
+    def parameter(self, s):
+        return s[0]
+
+    def typed_parameter(self, s):
+        if len(s):
+            return IfcType(s[0], s[1])
+        else:
+            return ()
+
+    def omitted_parameter(self, s):
+        return s[0]
+
+    def enumeration(self, s):
+        return s[0]
+
+    parameter_list = tuple
+    list = tuple
+    subsuper_record = list
+    INT = int
+    REAL = float
+    NONE = lambda *args: None
+    STAR = str
+
+
+def create_step_entity(entity_tree):
+    t = T(visit_tokens=True).transform(entity_tree)
+
+    def get_line_number(t):
+        if isinstance(t, Token):
+            yield t.line
+
+    def traverse(fn, x):
+        yield from fn(x)
+        if isinstance(x, Tree):
+            for c in x.children:
+                yield from traverse(fn, c)
+
+    lines = list(traverse(get_line_number, entity_tree))
+
+    entity_id = t.children[0].children[0]
+    entity_type = t.children[0].children[1].children[0]
+
+    attributes_tree = t.children[0].children[1].children[1]
+    attributes = list(attributes_tree)
+
+    return entity_instance(
+        entity_id,
+        entity_type,
+        attributes,
+        (min(lines), max(lines)),
+    )
+
+def make_header_ent(ast):
+    rule = ast.data
+    params = T(visit_tokens=True).transform(ast.children[0])
+    return rule.upper(), params
diff --git a/test_parser.py b/test_parser.py
index 48f3185..5e4ef56 100644
--- a/test_parser.py
+++ b/test_parser.py
@@ -1,13 +1,13 @@
 import glob
 import pytest
 
-from __init__ import parse, open, ValidationError
+from __init__ import parse, open, _ValidationError, CollectedValidationErrors, DuplicateNameError, HeaderFieldError
 from contextlib import nullcontext
 
 
 def create_context(fn):
     if "fail_" in fn:
-        return pytest.raises(ValidationError)
+        return pytest.raises(_ValidationError)
     else:
         return nullcontext()
 
@@ -20,6 +20,10 @@ def test_file_with_tree(file):
 
 @pytest.mark.parametrize("file", glob.glob("fixtures/*.ifc"))
 def test_file_without_tree(file):
+    if any(sub in file for sub in ["fail_too_many_header_entity_fields.ifc", "fail_multiple_wrong_header_fields"]):
+        pytest.skip("This file relies on header field validation using the parsed AST, "
+                "but with_tree=False uses a NullTransformer that discards the AST, "
+                "so validating the header field names is not possible in this mode.")
     with create_context(file):
         parse(filename=file, with_tree=False)
 
@@ -28,9 +32,11 @@ def test_parse_features():
     f = open('fixtures/pass_1.ifc')
     assert f.by_id(1).id == 1
     assert f.by_id(1).type == 'IFCPERSON'
+    assert f.data_[1][0].type == 'IFCPERSON'
     assert f.by_type('ifcperson')[0].id == 1
     assert f[1][0] is None
     assert f.header.file_description[0][0] == 'ViewDefinition [CoordinationView]'
+    assert f.header_.get('FILE_DESCRIPTION')[0][0]
     assert f.by_type('ifcapplication')[1][2] == "Nested ' quotes"
 
 
@@ -113,9 +119,9 @@ def test_file_mvd_attr():
     'fixtures/fail_no_header.ifc',
 ])
 def test_invalid_headers_(filename):
-    # error in header; with_header should raise an error
-    with pytest.raises(ValidationError):
-        parse(filename=filename, with_tree=False, only_header=True, with_header=True)
+    # error in header
+    with pytest.raises(_ValidationError):
+        parse(filename=filename, with_tree=False, only_header=True)
 
 @pytest.mark.parametrize("filename", [
     'fixtures/fail_duplicate_id.ifc',
@@ -123,6 +129,32 @@ def test_invalid_headers_(filename):
     'fixtures/fail_double_semi.ifc'
 ])
 def test_valid_headers(filename):
-    # error in body; with_header should not raise an error
+    # error in body
     with nullcontext():
-        parse(filename=filename, with_tree=False, only_header=True, with_header=True)
+        parse(filename=filename, with_tree=False, only_header=True)
+
+def test_header_entity_fields():
+    with pytest.raises(_ValidationError):
+        parse(filename='fixtures/fail_too_many_header_entity_fields.ifc', only_header=True)
+
+def test_header_entity_fields_whole_file():
+    with pytest.raises(_ValidationError):
+        parse(filename='fixtures/fail_too_many_header_entity_fields.ifc')
+
+def test_header_entity_fields_whole_file():
+    with pytest.raises(CollectedValidationErrors) as exc_info:
+        parse(filename="fixtures/fail_multiple_duplicate_ids.ifc")
+
+    errors = exc_info.value.errors
+    
+    assert len(errors) == 2
+    assert all(isinstance(e, DuplicateNameError) for e in errors)
+
+def test_multiple_wrong_header_fields():
+    with pytest.raises(CollectedValidationErrors) as exc_info:
+        parse(filename="fixtures/fail_multiple_wrong_header_fields.ifc")
+
+    errors = exc_info.value.errors
+    
+    assert len(errors) == 2
+    assert all(isinstance(e, HeaderFieldError) for e in errors)
\ No newline at end of file