# Copyright (c) 2020-2023, Manfred Moitzi # License: MIT License from __future__ import annotations import string import typing from typing import ( TYPE_CHECKING, BinaryIO, Iterable, Iterator, Callable, Union, Optional, ) import itertools import re from collections import defaultdict from pathlib import Path import logging from ezdxf.lldxf import const from ezdxf.lldxf import repair from ezdxf.lldxf.encoding import ( has_dxf_unicode, decode_dxf_unicode, has_mif_encoding, decode_mif_to_unicode, ) from ezdxf.lldxf.types import ( DXFTag, DXFVertex, DXFBinaryTag, POINT_CODES, BINARY_DATA, TYPE_TABLE, MAX_GROUP_CODE, ) from ezdxf.lldxf.tags import group_tags, Tags from ezdxf.lldxf.validator import entity_structure_validator from ezdxf.tools.codepage import toencoding from ezdxf.audit import Auditor, AuditError if TYPE_CHECKING: from ezdxf.document import Drawing from ezdxf.eztypes import SectionDict __all__ = ["read", "readfile"] EXCLUDE_STRUCTURE_CHECK = { "SECTION", "ENDSEC", "EOF", "TABLE", "ENDTAB", "ENDBLK", "SEQEND", } logger = logging.getLogger("ezdxf") def readfile( filename: Union[str, Path], errors: str = "surrogateescape" ) -> tuple[Drawing, Auditor]: """Read a DXF document from file system similar to :func:`ezdxf.readfile`, but this function will repair as many flaws as possible, runs the required audit process automatically the DXF document and the :class:`Auditor`. Args: filename: file-system name of the DXF document to load errors: specify decoding error handler - "surrogateescape" to preserve possible binary data (default) - "ignore" to use the replacement char U+FFFD "\ufffd" for invalid data - "strict" to raise an :class:`UnicodeDecodeError` exception for invalid data Raises: DXFStructureError: for invalid or corrupted DXF structures UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs """ filename = str(filename) with open(filename, mode="rb") as fp: doc, auditor = read(fp, errors=errors) doc.filename = filename return doc, auditor def read(stream: BinaryIO, errors: str = "surrogateescape") -> tuple[Drawing, Auditor]: """Read a DXF document from a binary-stream similar to :func:`ezdxf.read`, but this function will detect the text encoding automatically and repair as many flaws as possible, runs the required audit process afterwards and returns the DXF document and the :class:`Auditor`. Args: stream: data stream to load in binary read mode errors: specify decoding error handler - "surrogateescape" to preserve possible binary data (default) - "ignore" to use the replacement char U+FFFD "\ufffd" for invalid data - "strict" to raise an :class:`UnicodeDecodeError` exception for invalid data Raises: DXFStructureError: for invalid or corrupted DXF structures UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs """ recover_tool = Recover.run(stream, errors=errors) return _load_and_audit_document(recover_tool) def explore( filename: Union[str, Path], errors: str = "ignore" ) -> tuple[Drawing, Auditor]: """Read a DXF document from file system similar to :func:`readfile`, but this function will use a special tag loader, which tries to recover the tag stream if invalid tags occur. This function is intended to load corrupted DXF files and should only be used to explore such files, data loss is very likely. Args: filename: file-system name of the DXF document to load errors: specify decoding error handler - "surrogateescape" to preserve possible binary data (default) - "ignore" to use the replacement char U+FFFD "\ufffd" for invalid data - "strict" to raise an :class:`UnicodeDecodeError` exception for invalid data Raises: DXFStructureError: for invalid or corrupted DXF structures UnicodeDecodeError: if `errors` is "strict" and a decoding error occurs """ filename = str(filename) with open(filename, mode="rb") as fp: recover_tool = Recover.run(fp, errors=errors, loader=synced_bytes_loader) doc, auditor = _load_and_audit_document(recover_tool) doc.filename = filename return doc, auditor def _load_and_audit_document(recover_tool) -> tuple[Drawing, Auditor]: from ezdxf.document import Drawing doc = Drawing() doc._load_section_dict(recover_tool.section_dict) auditor = Auditor(doc) for code, msg in recover_tool.errors: auditor.add_error(code, msg) for code, msg in recover_tool.fixes: auditor.fixed_error(code, msg) auditor.run() return doc, auditor # noinspection PyMethodMayBeStatic class Recover: """Loose coupled recovering tools.""" def __init__(self, loader: Optional[Callable] = None): # different tag loading strategies can be used: # - bytes_loader(): expects a valid low level structure # - synced_bytes_loader(): loads everything which looks like a tag # and skip other content (dangerous!) self.tag_loader = loader or bytes_loader # The main goal of all efforts, a Drawing compatible dict of sections: self.section_dict: "SectionDict" = dict() # Store error messages from low level processes self.errors: list[tuple[int, str]] = [] self.fixes: list[tuple[int, str]] = [] # Detected DXF version self.dxfversion = const.DXF12 @classmethod def run( cls, stream: BinaryIO, loader: Optional[Callable] = None, errors: str = "surrogateescape", ) -> Recover: """Execute the recover process.""" recover_tool = Recover(loader) tags = recover_tool.load_tags(stream, errors) sections = recover_tool.rebuild_sections(tags) recover_tool.load_section_dict(sections) tables = recover_tool.section_dict.get("TABLES") if tables: tables = recover_tool.rebuild_tables(tables) # type: ignore recover_tool.section_dict["TABLES"] = tables if recover_tool.dxfversion > "AC1009": recover_tool.recover_rootdict() recover_tool.fix_broken_layout_links() section_dict = recover_tool.section_dict is_r12 = recover_tool.dxfversion <= "AC1009" for name, entities in section_dict.items(): if name in {"TABLES", "BLOCKS", "OBJECTS", "ENTITIES"}: section_dict[name] = list( recover_tool.check_entities(entities, is_r12) # type: ignore ) return recover_tool def load_tags(self, stream: BinaryIO, errors: str) -> Iterator[DXFTag]: return safe_tag_loader( stream, self.tag_loader, messages=self.errors, errors=errors ) def rebuild_sections(self, tags: Iterable[DXFTag]) -> list[list[DXFTag]]: """Collect tags between SECTION and ENDSEC or next SECTION tag as list of DXFTag objects, collects tags outside of sections as an extra section. Returns: List of sections as list of DXFTag() objects, the last section contains orphaned tags found outside of sections """ # Invalid placed DXF entities are removed in the audit process! def close_section(): # ENDSEC tag is not collected nonlocal collector, inside_section if inside_section: sections.append(collector) else: # missing SECTION # ignore this tag, it is even not an orphan self.fixes.append( ( AuditError.MISSING_SECTION_TAG, "DXF structure error: missing SECTION tag.", ) ) collector = [] inside_section = False def open_section(): nonlocal inside_section if inside_section: # missing ENDSEC self.fixes.append( ( AuditError.MISSING_ENDSEC_TAG, "DXF structure error: missing ENDSEC tag.", ) ) close_section() collector.append(tag) inside_section = True def process_structure_tag(): if value == "SECTION": open_section() elif value == "ENDSEC": close_section() elif value == "EOF": if inside_section: self.fixes.append( ( AuditError.MISSING_ENDSEC_TAG, "DXF structure error: missing ENDSEC tag.", ) ) close_section() else: collect() def collect(): if inside_section: collector.append(tag) else: self.fixes.append( ( AuditError.FOUND_TAG_OUTSIDE_SECTION, f"DXF structure error: found tag outside section: " f"({code}, {value})", ) ) orphans.append(tag) orphans: list[DXFTag] = [] sections: list[list[DXFTag]] = [] collector: list[DXFTag] = [] inside_section = False for tag in tags: code, value = tag if code == 0: process_structure_tag() else: collect() sections.append(orphans) return sections def load_section_dict(self, sections: list[list[DXFTag]]) -> None: """Merge sections of same type.""" def add_section(name: str, tags) -> None: if name in section_dict: section_dict[name].extend(tags[2:]) else: section_dict[name] = tags def _build_section_dict(d: dict) -> None: for name, section in d.items(): if name in const.MANAGED_SECTIONS: self.section_dict[name] = list(group_tags(section, 0)) def _remove_unsupported_sections(d: dict): for name in ("CLASSES", "OBJECTS", "ACDSDATA"): if name in d: del d[name] self.fixes.append( ( AuditError.REMOVED_UNSUPPORTED_SECTION, f"Removed unsupported {name} section for DXF R12.", ) ) # Last section could be orphaned tags: orphans = sections.pop() if orphans and orphans[0] == (0, "SECTION"): # The last section contains not the orphaned tags: sections.append(orphans) orphans = [] section_dict: "SectionDict" = dict() for section in sections: code, name = section[1] if code == 2: add_section(name, section) else: # invalid section name tag e.g. (2, "HEADER") self.fixes.append( ( AuditError.MISSING_SECTION_NAME_TAG, "DXF structure error: missing section name tag, ignore section.", ) ) header = section_dict.setdefault( "HEADER", [ DXFTag(0, "SECTION"), # type: ignore DXFTag(2, "HEADER"), # type: ignore ], ) self.rescue_orphaned_header_vars(header, orphans) # type: ignore self.dxfversion = _detect_dxf_version(header) if self.dxfversion <= const.DXF12: _remove_unsupported_sections(section_dict) _build_section_dict(section_dict) def rebuild_tables(self, tables: list[Tags]) -> list[Tags]: """Rebuild TABLES section.""" # Note: the recover module does not report invalid placed table entries, # it just recovers them. The "normal" loading process ignore these # misplaced table entries and logs a warning. def append_table(name: str): if name not in content: return head = heads.get(name) if head: tables.append(head) else: # The new table head gets a valid handle from Auditor. tables.append(Tags([DXFTag(0, "TABLE"), DXFTag(2, name)])) tables.extend(content[name]) tables.append(Tags([DXFTag(0, "ENDTAB")])) heads = dict() content = defaultdict(list) valid_tables = set(const.TABLE_NAMES_ACAD_ORDER) for entry in tables: name = entry[0].value.upper() if name == "TABLE": try: table_name = entry[1].value.upper() except (IndexError, AttributeError): pass else: heads[table_name] = entry elif name in valid_tables: content[name].append(entry) tables = [Tags([DXFTag(0, "SECTION"), DXFTag(2, "TABLES")])] names = list(const.TABLE_NAMES_ACAD_ORDER) if self.dxfversion <= const.DXF12: # Ignore BLOCK_RECORD table names.remove("BLOCK_RECORD") if "BLOCK_RECORD" in content: self.fixes.append( ( AuditError.REMOVED_UNSUPPORTED_TABLE, f"Removed unsupported BLOCK_RECORD table for DXF R12.", ) ) for name in names: append_table(name) return tables def rescue_orphaned_header_vars( self, header: list[DXFTag], orphans: Iterable[DXFTag] ) -> None: var_name = None for tag in orphans: code, value = tag if code == 9: var_name = tag elif var_name is not None: header.append(var_name) header.append(tag) var_name = None def check_entities(self, entities: list[Tags], is_r12: bool) -> Iterator[Tags]: subclass_markers = (100,) for entity in entities: _, dxftype = entity[0] if dxftype in EXCLUDE_STRUCTURE_CHECK: yield entity else: # raises DXFStructureError() for invalid entities tags = Tags(entity_structure_validator(entity)) if is_r12: # subclass markers (100, ...) in DXF R12 files confuses the # ezdxf parser #1106 tags.remove_tags(subclass_markers) yield tags def recover_rootdict(self): objects = self.section_dict.get("OBJECTS") if not objects or len(objects) < 2: return # empty OBJECTS section # index 0 is [DXFTag(0, 'SECTION'), DXFTag(2, 'OBJECTS')], this is a # requirement to be stored in the section_dict! if _is_rootdict(objects[1]): return # everything is fine index, rootdict = _find_rootdict(objects) if index: # make rootdict to first entity in OBJECTS section objects[index] = objects[1] objects[1] = rootdict try: handle = rootdict.get_handle() except const.DXFValueError: handle = "None" self.fixes.append( ( AuditError.MISPLACED_ROOT_DICT, f"Recovered misplaced root DICTIONARY(#{handle}).", ) ) def fix_broken_layout_links(self): """Fixes broke links (block_record_handle) between LAYOUT and BLOCK_RECORD entities. See issue #997 for more information. """ pass def _detect_dxf_version(header: list) -> str: next_is_dxf_version = False for tag in header: if next_is_dxf_version: dxfversion = str(tag[1]).strip() if re.fullmatch(r"AC[0-9]{4}", dxfversion): return dxfversion else: break if tag == (9, "$ACADVER"): next_is_dxf_version = True return const.DXF12 def _is_rootdict(entity: Tags) -> bool: if entity[0] != (0, "DICTIONARY"): return False # The entry "ACAD_GROUP" in the rootdict is absolutely necessary! return any(tag == (3, "ACAD_GROUP") for tag in entity) def _find_rootdict(objects: list[Tags]) -> tuple[int, Tags]: for index, entity in enumerate(objects): if _is_rootdict(entity): return index, entity return 0, Tags() def safe_tag_loader( stream: BinaryIO, loader: Optional[Callable] = None, messages: Optional[list] = None, errors: str = "surrogateescape", ) -> Iterator[DXFTag]: """Yields :class:``DXFTag`` objects from a bytes `stream` (untrusted external source), skips all comment tags (group code == 999). - Fixes unordered and invalid vertex tags. - Pass :func:`synced_bytes_loader` as argument `loader` to brute force load invalid tag structure. Args: stream: input data stream as bytes loader: low level tag loader, default loader is :func:`bytes_loader` messages: list to store error messages errors: specify decoding error handler - "surrogateescape" to preserve possible binary data (default) - "ignore" to use the replacement char U+FFFD "\ufffd" for invalid data - "strict" to raise an :class:`UnicodeDecodeError` exception for invalid data """ if loader is None: loader = bytes_loader tags, detector_stream = itertools.tee(loader(stream), 2) encoding = detect_encoding(detector_stream) # Apply repair filter: tags = repair.tag_reorder_layer(tags) tags = repair.filter_invalid_point_codes(tags) # type: ignore tags = repair.filter_invalid_handles(tags) return byte_tag_compiler(tags, encoding, messages=messages, errors=errors) INT_PATTERN_S = re.compile(r"[+-]?\d+") INT_PATTERN_B = re.compile(rb"[+-]?\d+") def _search_int(s: Union[str, bytes]) -> int: """Emulate the behavior of the C function stoll(), which just stop converting strings to integers at the first invalid char without raising an exception. e.g. "42xyz" is a valid integer 42 """ res = re.search( INT_PATTERN_S if isinstance(s, str) else INT_PATTERN_B, s # type: ignore ) if res: s = res.group() return int(s) FLOAT_PATTERN_S = re.compile(r"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?") FLOAT_PATTERN_B = re.compile(rb"[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?") def _search_float(s: Union[str, bytes]) -> float: """Emulate the behavior of the C function stod(), which just stop converting strings to doubles at the first invalid char without raising an exception. e.g. "47.11xyz" is a valid double 47.11 """ res = re.search( FLOAT_PATTERN_S if isinstance(s, str) else FLOAT_PATTERN_B, s # type: ignore ) if res: s = res.group() return float(s) @typing.no_type_check def bytes_loader(stream: BinaryIO) -> Iterator[DXFTag]: """Yields :class:``DXFTag`` objects from a bytes `stream` (untrusted external source), skips all comment tags (group code == 999). ``DXFTag.code`` is always an ``int`` and ``DXFTag.value`` is always a raw bytes value without line endings. Works with file system streams and :class:`BytesIO` streams. Raises: DXFStructureError: Found invalid group code. """ eof = False line = 1 readline = stream.readline while not eof: code = readline() # ByteIO(): empty strings indicates EOF - does not raise an exception if code: try: code = int(code) except ValueError: try: # harder to find an int code = _search_int(code) except ValueError: code = code.decode(errors="ignore").rstrip("\r\n") raise const.DXFStructureError( f'Invalid group code "{code}" at line {line}.' ) else: return value = readline() # ByteIO(): empty strings indicates EOF if value: value = value.rstrip(b"\r\n") if code == 0 and value == b"EOF": eof = True if code != 999: yield DXFTag(code, value) line += 2 else: return def synced_bytes_loader(stream: BinaryIO) -> Iterator[DXFTag]: """Yields :class:``DXFTag`` objects from a bytes `stream` (untrusted external source), skips all comment tags (group code == 999). ``DXFTag.code`` is always an ``int`` and ``DXFTag.value`` is always a raw bytes value without line endings. Works with file system streams and :class:`BytesIO` streams. Does not raise DXFStructureError on invalid group codes, instead skips lines until a valid group code or EOF is found. This can remove invalid lines before group codes, but can not detect invalid lines between group code and tag value. """ code = 999 upper_boundary = MAX_GROUP_CODE + 1 readline = stream.readline while True: seeking_valid_group_code = True while seeking_valid_group_code: code = readline() # type: ignore if code: try: # hard to find an int code = _search_int(code) # type: ignore except ValueError: pass else: if 0 <= code < upper_boundary: seeking_valid_group_code = False else: return # empty string is EOF value = readline() if value: if code != 999: yield DXFTag(code, value.rstrip(b"\r\n")) else: return # empty string is EOF DWGCODEPAGE = b"$DWGCODEPAGE" ACADVER = b"$ACADVER" def _strip_whitespace(s: str) -> str: ws = set(string.whitespace) return "".join([c for c in s if c not in ws]) def detect_encoding(tags: Iterable[DXFTag]) -> str: """Detect text encoding from header variables $DWGCODEPAGE and $ACADVER out of a stream of DXFTag objects. Assuming a malformed DXF file: The header variables could reside outside of the HEADER section, an ENDSEC tag is not a reliable fact that no $DWGCODEPAGE or $ACADVER header variable will show up in the remaining tag stream. Worst case: DXF file without a $ACADVER var, and a $DWGCODEPAGE unequal to "ANSI_1252" at the end of the file. """ encoding = None dxfversion = None next_tag = None for code, value in tags: if code == 9: if value == DWGCODEPAGE: next_tag = DWGCODEPAGE # e.g. (3, "ANSI_1252") elif value == ACADVER: next_tag = ACADVER # e.g. (1, "AC1012") elif code == 3 and next_tag == DWGCODEPAGE: encoding = toencoding(value.decode(const.DEFAULT_ENCODING)) next_tag = None elif code == 1 and next_tag == ACADVER: dxfversion = value.decode(const.DEFAULT_ENCODING) next_tag = None if encoding and dxfversion: return "utf8" if dxfversion >= const.DXF2007 else encoding return const.DEFAULT_ENCODING @typing.no_type_check def byte_tag_compiler( tags: Iterable[DXFTag], encoding=const.DEFAULT_ENCODING, messages: Optional[list] = None, errors: str = "surrogateescape", ) -> Iterator[DXFTag]: """Compiles DXF tag values imported by bytes_loader() into Python types. Raises DXFStructureError() for invalid float values and invalid coordinate values. Expects DXF coordinates written in x, y[, z] order, see function :func:`safe_tag_loader` for usage with applied repair filters. Args: tags: DXF tag generator, yielding tag values as bytes like bytes_loader() encoding: text encoding messages: list to store error messages errors: specify decoding error handler - "surrogateescape" to preserve possible binary data (default) - "ignore" to use the replacement char U+FFFD "\ufffd" for invalid data - "strict" to raise an :class:`UnicodeDecodeError` exception for invalid data Raises: DXFStructureError: Found invalid DXF tag or unexpected coordinate order. """ def error_msg(tag): code = tag.code value = tag.value.decode(encoding) return f'Invalid tag ({code}, "{value}") near line: {line}.' def recover_int(s: Union[str, bytes]) -> int: if isinstance(s, bytes): s = s.decode(encoding="utf8", errors="ignore") value = _search_int(s) msg = f'recovered invalid integer value "{s}" near line {line} as "{value}"' messages.append((AuditError.INVALID_INTEGER_VALUE, msg)) logger.warning(msg) return value def recover_float(s: Union[str, bytes]) -> float: if isinstance(s, bytes): s = s.decode(encoding="utf8", errors="ignore") s = _strip_whitespace(s) value = _search_float(s) msg = f'recovered invalid floating point value "{s}" near line {line} as "{value}"' messages.append((AuditError.INVALID_FLOATING_POINT_VALUE, msg)) logger.warning(msg) return value assert isinstance(encoding, str) assert isinstance(errors, str) if messages is None: messages = [] tags = iter(tags) undo_tag = None line = 0 while True: try: if undo_tag is not None: x = undo_tag undo_tag = None else: x = next(tags) line += 2 code = x.code if code in POINT_CODES: y = next(tags) # y coordinate is mandatory line += 2 # e.g. y-code for x-code=10 is 20 if y.code != code + 10: raise const.DXFStructureError( f"Missing required y-coordinate near line: {line}." ) # optional z coordinate z = next(tags) line += 2 try: # is it a z-coordinate like (30, 0.0) for base x-code=10 if z.code == code + 20: try: point = ( float(x.value), float(y.value), float(z.value), ) except ValueError: # search for any float values point = ( recover_float(x.value), recover_float(y.value), recover_float(z.value), ) else: try: point = (float(x.value), float(y.value)) except ValueError: # search for any float values point = ( recover_float(x.value), recover_float(y.value), ) undo_tag = z except ValueError: raise const.DXFStructureError( f"Invalid floating point values near line: {line}." ) yield DXFVertex(code, point) elif code in BINARY_DATA: # maybe pre compiled in low level tagger (binary DXF) if isinstance(x, DXFBinaryTag): tag = x else: try: tag = DXFBinaryTag.from_string(code, x.value) except ValueError: raise const.DXFStructureError( f"Invalid binary data near line: {line}." ) yield tag else: # just a single tag type_ = TYPE_TABLE.get(code, str) value: bytes = x.value if type_ is str: if code == 0: # remove white space from structure tags value = x.value.strip().upper() try: # 2 stages to document decoding errors str_ = value.decode(encoding, errors="strict") except UnicodeDecodeError: str_ = value.decode(encoding, errors=errors) messages.append( ( AuditError.DECODING_ERROR, f"Fixed unicode decoding error near line {line}", ) ) # exclude structure tags (code == 0): if code: # Convert DXF-Unicode notation "\U+xxxx" to unicode if has_dxf_unicode(str_): str_ = decode_dxf_unicode(str_) # Convert MIF notation "\M+cxxxx" to unicode elif has_mif_encoding(str_): str_ = decode_mif_to_unicode(str_) yield DXFTag(code, str_) else: try: # fast path for int and float yield DXFTag(code, type_(value)) except ValueError: # slow path - e.g. ProE stores int values as floats :(( if type_ is int: try: yield DXFTag(code, recover_int(x.value)) except ValueError: raise const.DXFStructureError(error_msg(x)) elif type_ is float: try: yield DXFTag(code, recover_float(x.value)) except ValueError: raise const.DXFStructureError(error_msg(x)) else: raise const.DXFStructureError(error_msg(x)) except StopIteration: return