1"""
  2Base validator with common validation logic for document files.
  3"""
  4
  5import re
  6from pathlib import Path
  7
  8import lxml.etree
  9
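# The class below is meant to be subclassed per document format. A minimal usage
# sketch (illustrative only; the concrete subclass name and paths are hypothetical):
#
#     validator = WordSchemaValidator("unpacked_docx/", "original.docx", verbose=True)
#     if not validator.validate():
#         raise SystemExit(1)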

class BaseSchemaValidator:
    """Base validator with common validation logic for document files."""

    # Elements whose ID attributes must be unique
    # Format: element_name -> (attribute_name, scope)
    # scope is 'file' (unique within a file) or 'global' (unique across all files)
    UNIQUE_ID_REQUIREMENTS = {
        # Word elements
        "comment": ("id", "file"),  # Comment IDs in comments.xml
        "commentrangestart": ("id", "file"),  # Must match comment IDs
        "commentrangeend": ("id", "file"),  # Must match comment IDs
        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
        "bookmarkend": ("id", "file"),  # Bookmark end IDs
        # Note: ins and del (track changes) can share IDs when part of same revision
        # PowerPoint elements
        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
        "cm": ("authorid", "file"),  # Comment author IDs
        # Excel elements
        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
        "definedname": ("id", "file"),  # Named range IDs
        # Drawing/Shape elements (all formats)
        "cxnsp": ("id", "file"),  # Connection shape IDs
        "sp": ("id", "file"),  # Shape IDs
        "pic": ("id", "file"),  # Picture IDs
        "grpsp": ("id", "file"),  # Group shape IDs
    }

    # Mapping of element names to expected relationship types
    # Subclasses should override this with format-specific mappings
    ELEMENT_RELATIONSHIP_TYPES = {}
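
    # For illustration only (the element names and relationship types below are
    # hypothetical examples, not taken from any concrete subclass), an override
    # might look like:
    #
    #     ELEMENT_RELATIONSHIP_TYPES = {
    #         "hyperlink": "hyperlink",  # <w:hyperlink r:id="..."/> -> .../hyperlink
    #         "imagedata": "image",      # image references -> an .../image relationship
    #     }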

    # Unified schema mappings for all Office document types
    SCHEMA_MAPPINGS = {
        # Document type specific schemas
        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
        # Common file types
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        # Word-specific files
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        # Chart files (common across document types)
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        # Theme files (common across document types)
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        # Drawing and media files
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Unified namespace constants
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    # Common OOXML namespaces used across validators
    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Folders where we should clean ignorable namespaces
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # All allowed OOXML namespaces (superset of all document types)
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }

    def __init__(self, unpacked_dir, original_file, verbose=False):
        self.unpacked_dir = Path(unpacked_dir).resolve()
        self.original_file = Path(original_file)
        self.verbose = verbose

        # Set schemas directory
        self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"

        # Get all XML and .rels files
        patterns = ["*.xml", "*.rels"]
        self.xml_files = [
            f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
        ]

        if not self.xml_files:
            print(f"Warning: No XML files found in {self.unpacked_dir}")

    def validate(self):
        """Run all validation checks and return True if all pass."""
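        # A concrete subclass would typically chain the individual checks, e.g.
        # (a hypothetical sketch, not the override used by any particular subclass):
        #
        #     def validate(self):
        #         return all([
        #             self.validate_xml(),
        #             self.validate_namespaces(),
        #             self.validate_unique_ids(),
        #             self.validate_file_references(),
        #             self.validate_all_relationship_ids(),
        #             self.validate_content_types(),
        #             self.validate_against_xsd(),
        #         ])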
        raise NotImplementedError("Subclasses must implement the validate method")

    def validate_xml(self):
        """Validate that all XML files are well-formed."""
        errors = []

        for xml_file in self.xml_files:
            try:
                # Try to parse the XML file
                lxml.etree.parse(str(xml_file))
            except lxml.etree.XMLSyntaxError as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Line {e.lineno}: {e.msg}"
                )
            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
                    f"Unexpected error: {str(e)}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} XML violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All XML files are well-formed")
            return True

    def validate_namespaces(self):
        """Validate that namespace prefixes in Ignorable attributes are declared."""
        errors = []

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                declared = set(root.nsmap.keys()) - {None}  # Exclude default namespace

                for attr_val in [
                    v for k, v in root.attrib.items() if k.endswith("Ignorable")
                ]:
                    undeclared = set(attr_val.split()) - declared
                    errors.extend(
                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Namespace prefix '{prefix}' listed in Ignorable but not declared"
                        for prefix in undeclared
                    )
            except lxml.etree.XMLSyntaxError:
                continue

        if errors:
            print(f"FAILED - {len(errors)} namespace issues:")
            for error in errors:
                print(error)
            return False
        if self.verbose:
            print("PASSED - All namespace prefixes properly declared")
        return True

    def validate_unique_ids(self):
        """Validate that specific IDs are unique according to OOXML requirements."""
        errors = []
        global_ids = {}  # Track globally unique IDs across all files

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                file_ids = {}  # Track IDs that must be unique within this file

                # Remove all mc:AlternateContent elements from the tree
                mc_elements = root.xpath(
                    ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                )
                for elem in mc_elements:
                    elem.getparent().remove(elem)

                # Now check IDs in the cleaned tree
                for elem in root.iter():
                    # Get the element name without namespace
                    tag = (
                        elem.tag.split("}")[-1].lower()
                        if "}" in elem.tag
                        else elem.tag.lower()
                    )

                    # Check if this element type has ID uniqueness requirements
                    if tag in self.UNIQUE_ID_REQUIREMENTS:
                        attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]

                        # Look for the specified attribute
                        id_value = None
                        for attr, value in elem.attrib.items():
                            attr_local = (
                                attr.split("}")[-1].lower()
                                if "}" in attr
                                else attr.lower()
                            )
                            if attr_local == attr_name:
                                id_value = value
                                break

                        if id_value is not None:
                            if scope == "global":
                                # Check global uniqueness
                                if id_value in global_ids:
                                    prev_file, prev_line, prev_tag = global_ids[
                                        id_value
                                    ]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                                        f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                                    )
                                else:
                                    global_ids[id_value] = (
                                        xml_file.relative_to(self.unpacked_dir),
                                        elem.sourceline,
                                        tag,
                                    )
                            elif scope == "file":
                                # Check file-level uniqueness
                                key = (tag, attr_name)
                                if key not in file_ids:
                                    file_ids[key] = {}

                                if id_value in file_ids[key]:
                                    prev_line = file_ids[key][id_value]
                                    errors.append(
                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
                                        f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                                        f"(first occurrence at line {prev_line})"
                                    )
                                else:
                                    file_ids[key][id_value] = elem.sourceline

            except Exception as e:
                errors.append(
                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("PASSED - All required IDs are unique")
            return True

    def validate_file_references(self):
        """
        Validate that all .rels files properly reference files and that all files are referenced.
        """
        errors = []

        # Find all .rels files
        rels_files = list(self.unpacked_dir.rglob("*.rels"))

        if not rels_files:
            if self.verbose:
                print("PASSED - No .rels files found")
            return True

        # Get all files in the unpacked directory (excluding reference files)
        all_files = []
        for file_path in self.unpacked_dir.rglob("*"):
            if (
                file_path.is_file()
                and file_path.name != "[Content_Types].xml"
                and not file_path.name.endswith(".rels")
            ):  # [Content_Types].xml and .rels files are not themselves referenced by .rels
                all_files.append(file_path.resolve())

        # Track all files that are referenced by any .rels file
        all_referenced_files = set()

        if self.verbose:
            print(
                f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
            )

        # Check each .rels file
        for rels_file in rels_files:
            try:
                # Parse relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Get the directory where this .rels file is located
                rels_dir = rels_file.parent

                # Find all relationships and their targets
                referenced_files = set()
                broken_refs = []

                for rel in rels_root.findall(
                    ".//ns:Relationship",
                    namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
                ):
                    target = rel.get("Target")
                    if target and not target.startswith(
                        ("http", "mailto:")
                    ):  # Skip external URLs
                        # Resolve the target path relative to the .rels file location
                        if rels_file.name == ".rels":
                            # Root .rels file - targets are relative to unpacked_dir
                            target_path = self.unpacked_dir / target
                        else:
                            # Other .rels files - targets are relative to their parent's parent
                            # e.g., word/_rels/document.xml.rels -> targets relative to word/
                            base_dir = rels_dir.parent
                            target_path = base_dir / target

                        # Normalize the path and check if it exists
                        try:
                            target_path = target_path.resolve()
                            if target_path.exists() and target_path.is_file():
                                referenced_files.add(target_path)
                                all_referenced_files.add(target_path)
                            else:
                                broken_refs.append((target, rel.sourceline))
                        except (OSError, ValueError):
                            broken_refs.append((target, rel.sourceline))

                # Report broken references
                if broken_refs:
                    rel_path = rels_file.relative_to(self.unpacked_dir)
                    for broken_ref, line_num in broken_refs:
                        errors.append(
                            f"  {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                        )

            except Exception as e:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error parsing {rel_path}: {e}")

        # Check for unreferenced files (files that exist but are not referenced anywhere)
        unreferenced_files = set(all_files) - all_referenced_files

        if unreferenced_files:
            for unref_file in sorted(unreferenced_files):
                unref_rel_path = unref_file.relative_to(self.unpacked_dir)
                errors.append(f"  Unreferenced file: {unref_rel_path}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship validation errors:")
            for error in errors:
                print(error)
            print(
                "CRITICAL: These errors will cause the document to appear corrupt. "
                + "Broken references MUST be fixed, "
                + "and unreferenced files MUST be referenced or removed."
            )
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All references are valid and all files are properly referenced"
                )
            return True

    def validate_all_relationship_ids(self):
        """
        Validate that all r:id attributes in XML files reference existing IDs
        in their corresponding .rels files, and optionally validate relationship types.
        """
        errors = []

        # Process each XML file that might contain r:id references
        for xml_file in self.xml_files:
            # Skip .rels files themselves
            if xml_file.suffix == ".rels":
                continue

            # Determine the corresponding .rels file
            # For dir/file.xml, it's dir/_rels/file.xml.rels
            rels_dir = xml_file.parent / "_rels"
            rels_file = rels_dir / f"{xml_file.name}.rels"

            # Skip if there's no corresponding .rels file (that's okay)
            if not rels_file.exists():
                continue

            try:
                # Parse the .rels file to get valid relationship IDs and their types
                rels_root = lxml.etree.parse(str(rels_file)).getroot()
                rid_to_type = {}

                for rel in rels_root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    rid = rel.get("Id")
                    rel_type = rel.get("Type", "")
                    if rid:
                        # Check for duplicate rIds
                        if rid in rid_to_type:
                            rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                            errors.append(
                                f"  {rels_rel_path}: Line {rel.sourceline}: "
                                f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                            )
                        # Extract just the type name from the full URL
                        type_name = (
                            rel_type.split("/")[-1] if "/" in rel_type else rel_type
                        )
                        rid_to_type[rid] = type_name

                # Parse the XML file to find all r:id references
                xml_root = lxml.etree.parse(str(xml_file)).getroot()

                # Find all elements with r:id attributes
                for elem in xml_root.iter():
                    # Check for r:id attribute (relationship ID)
                    rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
                    if rid_attr:
                        xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                        elem_name = (
                            elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
                        )

                        # Check if the ID exists
                        if rid_attr not in rid_to_type:
                            errors.append(
                                f"  {xml_rel_path}: Line {elem.sourceline}: "
                                f"<{elem_name}> references non-existent relationship '{rid_attr}' "
                                f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
                            )
                        # Check if we have type expectations for this element
                        elif self.ELEMENT_RELATIONSHIP_TYPES:
                            expected_type = self._get_expected_relationship_type(
                                elem_name
                            )
                            if expected_type:
                                actual_type = rid_to_type[rid_attr]
                                # Check if the actual type matches or contains the expected type
                                if expected_type not in actual_type.lower():
                                    errors.append(
                                        f"  {xml_rel_path}: Line {elem.sourceline}: "
                                        f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
                                        f"but should point to a '{expected_type}' relationship"
                                    )

            except Exception as e:
                xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                errors.append(f"  Error processing {xml_rel_path}: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
            for error in errors:
                print(error)
            print("\nThese ID mismatches will cause the document to appear corrupt!")
            return False
        else:
            if self.verbose:
                print("PASSED - All relationship ID references are valid")
            return True

    def _get_expected_relationship_type(self, element_name):
        """
        Get the expected relationship type for an element.
        First checks the explicit mapping, then tries pattern detection.
        """
        # Normalize element name to lowercase
        elem_lower = element_name.lower()

        # Check explicit mapping first
        if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
            return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]

        # Try pattern detection for common patterns
        # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
        if elem_lower.endswith("id") and len(elem_lower) > 2:
            # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
            prefix = elem_lower[:-2]  # Remove "id"
            # Check if this might be a compound like "sldMasterId"
            if prefix.endswith("master"):
                return prefix.lower()
            elif prefix.endswith("layout"):
                return prefix.lower()
            else:
                # Simple case like "sldId" -> "slide"
                # Common transformations
                if prefix == "sld":
                    return "slide"
                return prefix.lower()

        # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
        if elem_lower.endswith("reference") and len(elem_lower) > 9:
            prefix = elem_lower[:-9]  # Remove "reference"
            return prefix.lower()

        return None

    def validate_content_types(self):
        """Validate that all content files are properly declared in [Content_Types].xml."""
        errors = []

        # Find [Content_Types].xml file
        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            # Parse and get all declared parts and extensions
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            # Get Override declarations (specific files)
            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    declared_parts.add(part_name.lstrip("/"))

            # Get Default declarations (by extension)
            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            # Root elements that require content type declaration
            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",  # PowerPoint
                "document",  # Word
                "workbook",
                "worksheet",  # Excel
                "theme",  # Common
            }

            # Common media file extensions that should be declared
            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            # Get all files in the unpacked directory
            all_files = list(self.unpacked_dir.rglob("*"))
            all_files = [f for f in all_files if f.is_file()]

            # Check all XML files for Override declarations
            for xml_file in self.xml_files:
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                # Skip non-content files
                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f"  {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    continue  # Skip unparseable files

            # Check all non-XML files for Default extension declarations
            for file_path in all_files:
                # Skip XML files and metadata files (already checked above)
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    # Check if it's a known media extension that should be declared
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f'  {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )

        except Exception as e:
            errors.append(f"  Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All content files are properly declared in [Content_Types].xml"
                )
            return True

    def validate_file_against_xsd(self, xml_file, verbose=False):
        """Validate a single XML file against XSD schema, comparing with original.

        Args:
            xml_file: Path to XML file to validate
            verbose: Enable verbose output

        Returns:
            tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
        """
        # Resolve both paths to handle symlinks
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()

        # Validate current file
        is_valid, current_errors = self._validate_single_file_xsd(
            xml_file, unpacked_dir
        )

        if is_valid is None:
            return None, set()  # Skipped
        elif is_valid:
            return True, set()  # Valid, no errors

        # Get errors from original file for this specific file
        original_errors = self._get_original_file_errors(xml_file)

        # Compare with original (both are guaranteed to be sets here)
        assert current_errors is not None
        new_errors = current_errors - original_errors

        if new_errors:
            if verbose:
                relative_path = xml_file.relative_to(unpacked_dir)
                print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
                for error in list(new_errors)[:3]:
                    truncated = error[:250] + "..." if len(error) > 250 else error
                    print(f"  - {truncated}")
            return False, new_errors
        else:
            # All errors existed in original
            if verbose:
                print(
                    f"PASSED - No new errors (all {len(current_errors)} error(s) already present in original)"
                )
            return True, set()

    def validate_against_xsd(self):
        """Validate XML files against XSD schemas, showing only new errors compared to original."""
        new_errors = []
        original_error_count = 0
        valid_count = 0
        skipped_count = 0

        for xml_file in self.xml_files:
            relative_path = str(xml_file.relative_to(self.unpacked_dir))
            is_valid, new_file_errors = self.validate_file_against_xsd(
                xml_file, verbose=False
            )

            if is_valid is None:
                skipped_count += 1
                continue
            elif is_valid and not new_file_errors:
                valid_count += 1
                continue
            elif is_valid:
                # Had errors but all existed in original
                original_error_count += 1
                valid_count += 1
                continue

            # Has new errors
            new_errors.append(f"  {relative_path}: {len(new_file_errors)} new error(s)")
            for error in list(new_file_errors)[:3]:  # Show first 3 errors
                new_errors.append(
                    f"    - {error[:250]}..." if len(error) > 250 else f"    - {error}"
                )

        # Print summary
        if self.verbose:
            print(f"Validated {len(self.xml_files)} files:")
            print(f"  - Valid: {valid_count}")
            print(f"  - Skipped (no schema): {skipped_count}")
            if original_error_count:
                print(f"  - With original errors (ignored): {original_error_count}")
            print(
                f"  - With NEW errors: {len([e for e in new_errors if not e.startswith('    ')])}"
            )

        if new_errors:
            print("\nFAILED - Found NEW validation errors:")
            for error in new_errors:
                print(error)
            return False
        else:
            if self.verbose:
                print("\nPASSED - No new XSD validation errors introduced")
            return True

    def _get_schema_path(self, xml_file):
        """Determine the appropriate schema path for an XML file."""
        # Check exact filename match
        if xml_file.name in self.SCHEMA_MAPPINGS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]

        # Check .rels files
        if xml_file.suffix == ".rels":
            return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]

        # Check chart files
        if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]

        # Check theme files
        if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
            return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]

        # Check if file is in a main content folder and use appropriate schema
        if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
            return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]

        return None

    def _clean_ignorable_namespaces(self, xml_doc):
        """Remove attributes and elements not in allowed namespaces."""
        # Create a clean copy
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        # Remove attributes not in allowed namespaces
        for elem in xml_copy.iter():
            attrs_to_remove = []

            for attr in elem.attrib:
                # Check if attribute is from a namespace other than allowed ones
                if "{" in attr:
                    ns = attr.split("}")[0][1:]
                    if ns not in self.OOXML_NAMESPACES:
                        attrs_to_remove.append(attr)

            # Remove collected attributes
            for attr in attrs_to_remove:
                del elem.attrib[attr]

        # Remove elements not in allowed namespaces
        self._remove_ignorable_elements(xml_copy)

        return lxml.etree.ElementTree(xml_copy)

    def _remove_ignorable_elements(self, root):
        """Recursively remove all elements not in allowed namespaces."""
        elements_to_remove = []

        # Find elements to remove
        for elem in list(root):
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue

            tag_str = str(elem.tag)
            if tag_str.startswith("{"):
                ns = tag_str.split("}")[0][1:]
                if ns not in self.OOXML_NAMESPACES:
                    elements_to_remove.append(elem)
                    continue

            # Recursively clean child elements
            self._remove_ignorable_elements(elem)

        # Remove collected elements
        for elem in elements_to_remove:
            root.remove(elem)

    def _preprocess_for_mc_ignorable(self, xml_doc):
        """Preprocess XML to handle mc:Ignorable attribute properly."""
        # Remove mc:Ignorable attributes before validation
        root = xml_doc.getroot()

        # Remove mc:Ignorable attribute from root
        if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
            del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]

        return xml_doc

    def _validate_single_file_xsd(self, xml_file, base_path):
        """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
        schema_path = self._get_schema_path(xml_file)
        if not schema_path:
            return None, None  # Skip file

        try:
            # Load schema
            with open(schema_path, "rb") as xsd_file:
                parser = lxml.etree.XMLParser()
                xsd_doc = lxml.etree.parse(
                    xsd_file, parser=parser, base_url=str(schema_path)
                )
                schema = lxml.etree.XMLSchema(xsd_doc)

            # Load and preprocess XML
            with open(xml_file, "r") as f:
                xml_doc = lxml.etree.parse(f)

            xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
            xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

            # Clean ignorable namespaces if needed
            relative_path = xml_file.relative_to(base_path)
            if (
                relative_path.parts
                and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
            ):
                xml_doc = self._clean_ignorable_namespaces(xml_doc)

            # Validate
            if schema.validate(xml_doc):
                return True, set()
            else:
                errors = set()
                for error in schema.error_log:
                    # Store normalized error message (without line numbers for comparison)
                    errors.add(error.message)
                return False, errors

        except Exception as e:
            return False, {str(e)}

    def _get_original_file_errors(self, xml_file):
        """Get XSD validation errors from a single file in the original document.

        Args:
            xml_file: Path to the XML file in unpacked_dir to check

        Returns:
            set: Set of error messages from the original file
        """
        import tempfile
        import zipfile

        # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
        xml_file = Path(xml_file).resolve()
        unpacked_dir = self.unpacked_dir.resolve()
        relative_path = xml_file.relative_to(unpacked_dir)

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Extract original file
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                zip_ref.extractall(temp_path)

            # Find corresponding file in original
            original_xml_file = temp_path / relative_path

            if not original_xml_file.exists():
                # File didn't exist in original, so no original errors
                return set()

            # Validate the specific file in original
            is_valid, errors = self._validate_single_file_xsd(
                original_xml_file, temp_path
            )
            return errors if errors else set()

    def _remove_template_tags_from_text_nodes(self, xml_doc):
        """Remove template tags from XML text nodes and collect warnings.

        Template tags follow the pattern {{ ... }} and are used as placeholders
        for content replacement. They should be removed from text content before
        XSD validation while preserving XML structure.

        Returns:
            tuple: (cleaned_xml_doc, warnings_list)
        """
        warnings = []
        template_pattern = re.compile(r"\{\{[^}]*\}\}")

        # Create a copy of the document to avoid modifying the original
        xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
        xml_copy = lxml.etree.fromstring(xml_string)

        def process_text_content(text, content_type):
            if not text:
                return text
            matches = list(template_pattern.finditer(text))
            if matches:
                for match in matches:
                    warnings.append(
                        f"Found template tag in {content_type}: {match.group()}"
                    )
                return template_pattern.sub("", text)
            return text

        # Process all text nodes in the document
        for elem in xml_copy.iter():
            # Skip non-element nodes (comments, processing instructions, etc.)
            if not hasattr(elem, "tag") or callable(elem.tag):
                continue
            # Leave the text of <w:t>-style run elements untouched
            tag_str = str(elem.tag)
            if tag_str.endswith("}t") or tag_str == "t":
                continue

            elem.text = process_text_content(elem.text, "text content")
            elem.tail = process_text_content(elem.tail, "tail content")

        return lxml.etree.ElementTree(xml_copy), warnings


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")