1"""
2Base validator with common validation logic for document files.
3"""
4
5import re
6from pathlib import Path
7
8import lxml.etree
9
10
class BaseSchemaValidator:
    """Base validator with common validation logic for document files.

    Holds the class-level lookup tables (ID-uniqueness rules, XSD schema
    mappings, namespace constants) shared by the format-specific validators
    that subclass it and implement validate().
    """

    # Elements whose 'id' attributes must be unique within their file
    # Format: element_name (lowercased local name) -> (attribute_name, scope)
    # scope can be 'file' (unique within file) or 'global' (unique across all files)
    UNIQUE_ID_REQUIREMENTS = {
        # Word elements
        "comment": ("id", "file"),  # Comment IDs in comments.xml
        "commentrangestart": ("id", "file"),  # Must match comment IDs
        "commentrangeend": ("id", "file"),  # Must match comment IDs
        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
        "bookmarkend": ("id", "file"),  # Bookmark end IDs
        # Note: ins and del (track changes) can share IDs when part of same revision
        # PowerPoint elements
        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
        "cm": ("authorid", "file"),  # Comment author IDs
        # Excel elements
        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
        "definedname": ("id", "file"),  # Named range IDs
        # Drawing/Shape elements (all formats)
        "cxnsp": ("id", "file"),  # Connection shape IDs
        "sp": ("id", "file"),  # Shape IDs
        "pic": ("id", "file"),  # Picture IDs
        "grpsp": ("id", "file"),  # Group shape IDs
    }

    # Mapping of element names to expected relationship types
    # Subclasses should override this with format-specific mappings
    ELEMENT_RELATIONSHIP_TYPES = {}

    # Unified schema mappings for all Office document types.
    # Keys are consulted by _get_schema_path in this order: exact filename,
    # ".rels" suffix, "chart"/"theme" path patterns, then parent folder name.
    SCHEMA_MAPPINGS = {
        # Document type specific schemas
        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
        # Common file types
        # NOTE(review): "fouth-edition" looks like a typo of "fourth-edition",
        # but it must match the on-disk schemas directory — confirm before renaming.
        "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
        "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
        "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
        "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
        ".rels": "ecma/fouth-edition/opc-relationships.xsd",
        # Word-specific files
        "people.xml": "microsoft/wml-2012.xsd",
        "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
        "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
        "commentsExtended.xml": "microsoft/wml-2012.xsd",
        # Chart files (common across document types)
        "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
        # Theme files (common across document types)
        "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
        # Drawing and media files
        "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
    }

    # Unified namespace constants
    MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
    XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    # Common OOXML namespaces used across validators
    PACKAGE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/relationships"
    )
    OFFICE_RELATIONSHIPS_NAMESPACE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    )
    CONTENT_TYPES_NAMESPACE = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )

    # Folders where we should clean ignorable namespaces before XSD validation
    MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}

    # All allowed OOXML namespaces (superset of all document types);
    # attributes/elements outside these are stripped by
    # _clean_ignorable_namespaces before XSD validation.
    OOXML_NAMESPACES = {
        "http://schemas.openxmlformats.org/officeDocument/2006/math",
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "http://schemas.openxmlformats.org/schemaLibrary/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/diagram",
        "http://schemas.openxmlformats.org/drawingml/2006/picture",
        "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "http://schemas.openxmlformats.org/presentationml/2006/main",
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
        "http://www.w3.org/XML/1998/namespace",
    }
105
106 def __init__(self, unpacked_dir, original_file, verbose=False):
107 self.unpacked_dir = Path(unpacked_dir).resolve()
108 self.original_file = Path(original_file)
109 self.verbose = verbose
110
111 # Set schemas directory
112 self.schemas_dir = Path(__file__).parent.parent.parent / "schemas"
113
114 # Get all XML and .rels files
115 patterns = ["*.xml", "*.rels"]
116 self.xml_files = [
117 f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
118 ]
119
120 if not self.xml_files:
121 print(f"Warning: No XML files found in {self.unpacked_dir}")
122
123 def validate(self):
124 """Run all validation checks and return True if all pass."""
125 raise NotImplementedError("Subclasses must implement the validate method")
126
127 def validate_xml(self):
128 """Validate that all XML files are well-formed."""
129 errors = []
130
131 for xml_file in self.xml_files:
132 try:
133 # Try to parse the XML file
134 lxml.etree.parse(str(xml_file))
135 except lxml.etree.XMLSyntaxError as e:
136 errors.append(
137 f" {xml_file.relative_to(self.unpacked_dir)}: "
138 f"Line {e.lineno}: {e.msg}"
139 )
140 except Exception as e:
141 errors.append(
142 f" {xml_file.relative_to(self.unpacked_dir)}: "
143 f"Unexpected error: {str(e)}"
144 )
145
146 if errors:
147 print(f"FAILED - Found {len(errors)} XML violations:")
148 for error in errors:
149 print(error)
150 return False
151 else:
152 if self.verbose:
153 print("PASSED - All XML files are well-formed")
154 return True
155
156 def validate_namespaces(self):
157 """Validate that namespace prefixes in Ignorable attributes are declared."""
158 errors = []
159
160 for xml_file in self.xml_files:
161 try:
162 root = lxml.etree.parse(str(xml_file)).getroot()
163 declared = set(root.nsmap.keys()) - {None} # Exclude default namespace
164
165 for attr_val in [
166 v for k, v in root.attrib.items() if k.endswith("Ignorable")
167 ]:
168 undeclared = set(attr_val.split()) - declared
169 errors.extend(
170 f" {xml_file.relative_to(self.unpacked_dir)}: "
171 f"Namespace '{ns}' in Ignorable but not declared"
172 for ns in undeclared
173 )
174 except lxml.etree.XMLSyntaxError:
175 continue
176
177 if errors:
178 print(f"FAILED - {len(errors)} namespace issues:")
179 for error in errors:
180 print(error)
181 return False
182 if self.verbose:
183 print("PASSED - All namespace prefixes properly declared")
184 return True
185
186 def validate_unique_ids(self):
187 """Validate that specific IDs are unique according to OOXML requirements."""
188 errors = []
189 global_ids = {} # Track globally unique IDs across all files
190
191 for xml_file in self.xml_files:
192 try:
193 root = lxml.etree.parse(str(xml_file)).getroot()
194 file_ids = {} # Track IDs that must be unique within this file
195
196 # Remove all mc:AlternateContent elements from the tree
197 mc_elements = root.xpath(
198 ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
199 )
200 for elem in mc_elements:
201 elem.getparent().remove(elem)
202
203 # Now check IDs in the cleaned tree
204 for elem in root.iter():
205 # Get the element name without namespace
206 tag = (
207 elem.tag.split("}")[-1].lower()
208 if "}" in elem.tag
209 else elem.tag.lower()
210 )
211
212 # Check if this element type has ID uniqueness requirements
213 if tag in self.UNIQUE_ID_REQUIREMENTS:
214 attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
215
216 # Look for the specified attribute
217 id_value = None
218 for attr, value in elem.attrib.items():
219 attr_local = (
220 attr.split("}")[-1].lower()
221 if "}" in attr
222 else attr.lower()
223 )
224 if attr_local == attr_name:
225 id_value = value
226 break
227
228 if id_value is not None:
229 if scope == "global":
230 # Check global uniqueness
231 if id_value in global_ids:
232 prev_file, prev_line, prev_tag = global_ids[
233 id_value
234 ]
235 errors.append(
236 f" {xml_file.relative_to(self.unpacked_dir)}: "
237 f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
238 f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
239 )
240 else:
241 global_ids[id_value] = (
242 xml_file.relative_to(self.unpacked_dir),
243 elem.sourceline,
244 tag,
245 )
246 elif scope == "file":
247 # Check file-level uniqueness
248 key = (tag, attr_name)
249 if key not in file_ids:
250 file_ids[key] = {}
251
252 if id_value in file_ids[key]:
253 prev_line = file_ids[key][id_value]
254 errors.append(
255 f" {xml_file.relative_to(self.unpacked_dir)}: "
256 f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
257 f"(first occurrence at line {prev_line})"
258 )
259 else:
260 file_ids[key][id_value] = elem.sourceline
261
262 except (lxml.etree.XMLSyntaxError, Exception) as e:
263 errors.append(
264 f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
265 )
266
267 if errors:
268 print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
269 for error in errors:
270 print(error)
271 return False
272 else:
273 if self.verbose:
274 print("PASSED - All required IDs are unique")
275 return True
276
277 def validate_file_references(self):
278 """
279 Validate that all .rels files properly reference files and that all files are referenced.
280 """
281 errors = []
282
283 # Find all .rels files
284 rels_files = list(self.unpacked_dir.rglob("*.rels"))
285
286 if not rels_files:
287 if self.verbose:
288 print("PASSED - No .rels files found")
289 return True
290
291 # Get all files in the unpacked directory (excluding reference files)
292 all_files = []
293 for file_path in self.unpacked_dir.rglob("*"):
294 if (
295 file_path.is_file()
296 and file_path.name != "[Content_Types].xml"
297 and not file_path.name.endswith(".rels")
298 ): # This file is not referenced by .rels
299 all_files.append(file_path.resolve())
300
301 # Track all files that are referenced by any .rels file
302 all_referenced_files = set()
303
304 if self.verbose:
305 print(
306 f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
307 )
308
309 # Check each .rels file
310 for rels_file in rels_files:
311 try:
312 # Parse relationships file
313 rels_root = lxml.etree.parse(str(rels_file)).getroot()
314
315 # Get the directory where this .rels file is located
316 rels_dir = rels_file.parent
317
318 # Find all relationships and their targets
319 referenced_files = set()
320 broken_refs = []
321
322 for rel in rels_root.findall(
323 ".//ns:Relationship",
324 namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
325 ):
326 target = rel.get("Target")
327 if target and not target.startswith(
328 ("http", "mailto:")
329 ): # Skip external URLs
330 # Resolve the target path relative to the .rels file location
331 if rels_file.name == ".rels":
332 # Root .rels file - targets are relative to unpacked_dir
333 target_path = self.unpacked_dir / target
334 else:
335 # Other .rels files - targets are relative to their parent's parent
336 # e.g., word/_rels/document.xml.rels -> targets relative to word/
337 base_dir = rels_dir.parent
338 target_path = base_dir / target
339
340 # Normalize the path and check if it exists
341 try:
342 target_path = target_path.resolve()
343 if target_path.exists() and target_path.is_file():
344 referenced_files.add(target_path)
345 all_referenced_files.add(target_path)
346 else:
347 broken_refs.append((target, rel.sourceline))
348 except (OSError, ValueError):
349 broken_refs.append((target, rel.sourceline))
350
351 # Report broken references
352 if broken_refs:
353 rel_path = rels_file.relative_to(self.unpacked_dir)
354 for broken_ref, line_num in broken_refs:
355 errors.append(
356 f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
357 )
358
359 except Exception as e:
360 rel_path = rels_file.relative_to(self.unpacked_dir)
361 errors.append(f" Error parsing {rel_path}: {e}")
362
363 # Check for unreferenced files (files that exist but are not referenced anywhere)
364 unreferenced_files = set(all_files) - all_referenced_files
365
366 if unreferenced_files:
367 for unref_file in sorted(unreferenced_files):
368 unref_rel_path = unref_file.relative_to(self.unpacked_dir)
369 errors.append(f" Unreferenced file: {unref_rel_path}")
370
371 if errors:
372 print(f"FAILED - Found {len(errors)} relationship validation errors:")
373 for error in errors:
374 print(error)
375 print(
376 "CRITICAL: These errors will cause the document to appear corrupt. "
377 + "Broken references MUST be fixed, "
378 + "and unreferenced files MUST be referenced or removed."
379 )
380 return False
381 else:
382 if self.verbose:
383 print(
384 "PASSED - All references are valid and all files are properly referenced"
385 )
386 return True
387
388 def validate_all_relationship_ids(self):
389 """
390 Validate that all r:id attributes in XML files reference existing IDs
391 in their corresponding .rels files, and optionally validate relationship types.
392 """
393 import lxml.etree
394
395 errors = []
396
397 # Process each XML file that might contain r:id references
398 for xml_file in self.xml_files:
399 # Skip .rels files themselves
400 if xml_file.suffix == ".rels":
401 continue
402
403 # Determine the corresponding .rels file
404 # For dir/file.xml, it's dir/_rels/file.xml.rels
405 rels_dir = xml_file.parent / "_rels"
406 rels_file = rels_dir / f"{xml_file.name}.rels"
407
408 # Skip if there's no corresponding .rels file (that's okay)
409 if not rels_file.exists():
410 continue
411
412 try:
413 # Parse the .rels file to get valid relationship IDs and their types
414 rels_root = lxml.etree.parse(str(rels_file)).getroot()
415 rid_to_type = {}
416
417 for rel in rels_root.findall(
418 f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
419 ):
420 rid = rel.get("Id")
421 rel_type = rel.get("Type", "")
422 if rid:
423 # Check for duplicate rIds
424 if rid in rid_to_type:
425 rels_rel_path = rels_file.relative_to(self.unpacked_dir)
426 errors.append(
427 f" {rels_rel_path}: Line {rel.sourceline}: "
428 f"Duplicate relationship ID '{rid}' (IDs must be unique)"
429 )
430 # Extract just the type name from the full URL
431 type_name = (
432 rel_type.split("/")[-1] if "/" in rel_type else rel_type
433 )
434 rid_to_type[rid] = type_name
435
436 # Parse the XML file to find all r:id references
437 xml_root = lxml.etree.parse(str(xml_file)).getroot()
438
439 # Find all elements with r:id attributes
440 for elem in xml_root.iter():
441 # Check for r:id attribute (relationship ID)
442 rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
443 if rid_attr:
444 xml_rel_path = xml_file.relative_to(self.unpacked_dir)
445 elem_name = (
446 elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
447 )
448
449 # Check if the ID exists
450 if rid_attr not in rid_to_type:
451 errors.append(
452 f" {xml_rel_path}: Line {elem.sourceline}: "
453 f"<{elem_name}> references non-existent relationship '{rid_attr}' "
454 f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
455 )
456 # Check if we have type expectations for this element
457 elif self.ELEMENT_RELATIONSHIP_TYPES:
458 expected_type = self._get_expected_relationship_type(
459 elem_name
460 )
461 if expected_type:
462 actual_type = rid_to_type[rid_attr]
463 # Check if the actual type matches or contains the expected type
464 if expected_type not in actual_type.lower():
465 errors.append(
466 f" {xml_rel_path}: Line {elem.sourceline}: "
467 f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' "
468 f"but should point to a '{expected_type}' relationship"
469 )
470
471 except Exception as e:
472 xml_rel_path = xml_file.relative_to(self.unpacked_dir)
473 errors.append(f" Error processing {xml_rel_path}: {e}")
474
475 if errors:
476 print(f"FAILED - Found {len(errors)} relationship ID reference errors:")
477 for error in errors:
478 print(error)
479 print("\nThese ID mismatches will cause the document to appear corrupt!")
480 return False
481 else:
482 if self.verbose:
483 print("PASSED - All relationship ID references are valid")
484 return True
485
486 def _get_expected_relationship_type(self, element_name):
487 """
488 Get the expected relationship type for an element.
489 First checks the explicit mapping, then tries pattern detection.
490 """
491 # Normalize element name to lowercase
492 elem_lower = element_name.lower()
493
494 # Check explicit mapping first
495 if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
496 return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
497
498 # Try pattern detection for common patterns
499 # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
500 if elem_lower.endswith("id") and len(elem_lower) > 2:
501 # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
502 prefix = elem_lower[:-2] # Remove "id"
503 # Check if this might be a compound like "sldMasterId"
504 if prefix.endswith("master"):
505 return prefix.lower()
506 elif prefix.endswith("layout"):
507 return prefix.lower()
508 else:
509 # Simple case like "sldId" -> "slide"
510 # Common transformations
511 if prefix == "sld":
512 return "slide"
513 return prefix.lower()
514
515 # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
516 if elem_lower.endswith("reference") and len(elem_lower) > 9:
517 prefix = elem_lower[:-9] # Remove "reference"
518 return prefix.lower()
519
520 return None
521
    def validate_content_types(self):
        """Validate that all content files are properly declared in [Content_Types].xml.

        Checks two kinds of declarations:
          * Override entries for XML parts whose root element requires one
            (slides, documents, workbooks, themes, ...)
          * Default entries for known media file extensions (png, jpg, ...)

        Returns:
            bool: True if no missing declarations were found.
        """
        errors = []

        # Find [Content_Types].xml file
        content_types_file = self.unpacked_dir / "[Content_Types].xml"
        if not content_types_file.exists():
            print("FAILED - [Content_Types].xml file not found")
            return False

        try:
            # Parse and get all declared parts and extensions
            root = lxml.etree.parse(str(content_types_file)).getroot()
            declared_parts = set()
            declared_extensions = set()

            # Get Override declarations (specific files)
            for override in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            ):
                part_name = override.get("PartName")
                if part_name is not None:
                    # PartName is package-absolute ("/word/document.xml");
                    # drop the leading slash to compare with relative paths.
                    declared_parts.add(part_name.lstrip("/"))

            # Get Default declarations (by extension)
            for default in root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            ):
                extension = default.get("Extension")
                if extension is not None:
                    declared_extensions.add(extension.lower())

            # Root elements that require content type declaration
            declarable_roots = {
                "sld",
                "sldLayout",
                "sldMaster",
                "presentation",  # PowerPoint
                "document",  # Word
                "workbook",
                "worksheet",  # Excel
                "theme",  # Common
            }

            # Common media file extensions that should be declared
            media_extensions = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "bmp": "image/bmp",
                "tiff": "image/tiff",
                "wmf": "image/x-wmf",
                "emf": "image/x-emf",
            }

            # Get all files in the unpacked directory
            all_files = list(self.unpacked_dir.rglob("*"))
            all_files = [f for f in all_files if f.is_file()]

            # Check all XML files for Override declarations
            for xml_file in self.xml_files:
                # Normalize to forward slashes so Windows paths compare equal.
                path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                    "\\", "/"
                )

                # Skip non-content files
                if any(
                    skip in path_str
                    for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
                ):
                    continue

                try:
                    root_tag = lxml.etree.parse(str(xml_file)).getroot().tag
                    root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag

                    if root_name in declarable_roots and path_str not in declared_parts:
                        errors.append(
                            f"  {path_str}: File with <{root_name}> root not declared in [Content_Types].xml"
                        )

                except Exception:
                    continue  # Skip unparseable files

            # Check all non-XML files for Default extension declarations
            for file_path in all_files:
                # Skip XML files and metadata files (already checked above)
                if file_path.suffix.lower() in {".xml", ".rels"}:
                    continue
                if file_path.name == "[Content_Types].xml":
                    continue
                if "_rels" in file_path.parts or "docProps" in file_path.parts:
                    continue

                extension = file_path.suffix.lstrip(".").lower()
                if extension and extension not in declared_extensions:
                    # Check if it's a known media extension that should be declared
                    if extension in media_extensions:
                        relative_path = file_path.relative_to(self.unpacked_dir)
                        errors.append(
                            f'  {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: <Default Extension="{extension}" ContentType="{media_extensions[extension]}"/>'
                        )

        except Exception as e:
            errors.append(f"  Error parsing [Content_Types].xml: {e}")

        if errors:
            print(f"FAILED - Found {len(errors)} content type declaration errors:")
            for error in errors:
                print(error)
            return False
        else:
            if self.verbose:
                print(
                    "PASSED - All content files are properly declared in [Content_Types].xml"
                )
            return True
640
641 def validate_file_against_xsd(self, xml_file, verbose=False):
642 """Validate a single XML file against XSD schema, comparing with original.
643
644 Args:
645 xml_file: Path to XML file to validate
646 verbose: Enable verbose output
647
648 Returns:
649 tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
650 """
651 # Resolve both paths to handle symlinks
652 xml_file = Path(xml_file).resolve()
653 unpacked_dir = self.unpacked_dir.resolve()
654
655 # Validate current file
656 is_valid, current_errors = self._validate_single_file_xsd(
657 xml_file, unpacked_dir
658 )
659
660 if is_valid is None:
661 return None, set() # Skipped
662 elif is_valid:
663 return True, set() # Valid, no errors
664
665 # Get errors from original file for this specific file
666 original_errors = self._get_original_file_errors(xml_file)
667
668 # Compare with original (both are guaranteed to be sets here)
669 assert current_errors is not None
670 new_errors = current_errors - original_errors
671
672 if new_errors:
673 if verbose:
674 relative_path = xml_file.relative_to(unpacked_dir)
675 print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)")
676 for error in list(new_errors)[:3]:
677 truncated = error[:250] + "..." if len(error) > 250 else error
678 print(f" - {truncated}")
679 return False, new_errors
680 else:
681 # All errors existed in original
682 if verbose:
683 print(
684 f"PASSED - No new errors (original had {len(current_errors)} errors)"
685 )
686 return True, set()
687
688 def validate_against_xsd(self):
689 """Validate XML files against XSD schemas, showing only new errors compared to original."""
690 new_errors = []
691 original_error_count = 0
692 valid_count = 0
693 skipped_count = 0
694
695 for xml_file in self.xml_files:
696 relative_path = str(xml_file.relative_to(self.unpacked_dir))
697 is_valid, new_file_errors = self.validate_file_against_xsd(
698 xml_file, verbose=False
699 )
700
701 if is_valid is None:
702 skipped_count += 1
703 continue
704 elif is_valid and not new_file_errors:
705 valid_count += 1
706 continue
707 elif is_valid:
708 # Had errors but all existed in original
709 original_error_count += 1
710 valid_count += 1
711 continue
712
713 # Has new errors
714 new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
715 for error in list(new_file_errors)[:3]: # Show first 3 errors
716 new_errors.append(
717 f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
718 )
719
720 # Print summary
721 if self.verbose:
722 print(f"Validated {len(self.xml_files)} files:")
723 print(f" - Valid: {valid_count}")
724 print(f" - Skipped (no schema): {skipped_count}")
725 if original_error_count:
726 print(f" - With original errors (ignored): {original_error_count}")
727 print(
728 f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}"
729 )
730
731 if new_errors:
732 print("\nFAILED - Found NEW validation errors:")
733 for error in new_errors:
734 print(error)
735 return False
736 else:
737 if self.verbose:
738 print("\nPASSED - No new XSD validation errors introduced")
739 return True
740
741 def _get_schema_path(self, xml_file):
742 """Determine the appropriate schema path for an XML file."""
743 # Check exact filename match
744 if xml_file.name in self.SCHEMA_MAPPINGS:
745 return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
746
747 # Check .rels files
748 if xml_file.suffix == ".rels":
749 return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
750
751 # Check chart files
752 if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
753 return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
754
755 # Check theme files
756 if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
757 return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
758
759 # Check if file is in a main content folder and use appropriate schema
760 if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
761 return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
762
763 return None
764
765 def _clean_ignorable_namespaces(self, xml_doc):
766 """Remove attributes and elements not in allowed namespaces."""
767 # Create a clean copy
768 xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
769 xml_copy = lxml.etree.fromstring(xml_string)
770
771 # Remove attributes not in allowed namespaces
772 for elem in xml_copy.iter():
773 attrs_to_remove = []
774
775 for attr in elem.attrib:
776 # Check if attribute is from a namespace other than allowed ones
777 if "{" in attr:
778 ns = attr.split("}")[0][1:]
779 if ns not in self.OOXML_NAMESPACES:
780 attrs_to_remove.append(attr)
781
782 # Remove collected attributes
783 for attr in attrs_to_remove:
784 del elem.attrib[attr]
785
786 # Remove elements not in allowed namespaces
787 self._remove_ignorable_elements(xml_copy)
788
789 return lxml.etree.ElementTree(xml_copy)
790
791 def _remove_ignorable_elements(self, root):
792 """Recursively remove all elements not in allowed namespaces."""
793 elements_to_remove = []
794
795 # Find elements to remove
796 for elem in list(root):
797 # Skip non-element nodes (comments, processing instructions, etc.)
798 if not hasattr(elem, "tag") or callable(elem.tag):
799 continue
800
801 tag_str = str(elem.tag)
802 if tag_str.startswith("{"):
803 ns = tag_str.split("}")[0][1:]
804 if ns not in self.OOXML_NAMESPACES:
805 elements_to_remove.append(elem)
806 continue
807
808 # Recursively clean child elements
809 self._remove_ignorable_elements(elem)
810
811 # Remove collected elements
812 for elem in elements_to_remove:
813 root.remove(elem)
814
815 def _preprocess_for_mc_ignorable(self, xml_doc):
816 """Preprocess XML to handle mc:Ignorable attribute properly."""
817 # Remove mc:Ignorable attributes before validation
818 root = xml_doc.getroot()
819
820 # Remove mc:Ignorable attribute from root
821 if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
822 del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
823
824 return xml_doc
825
826 def _validate_single_file_xsd(self, xml_file, base_path):
827 """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
828 schema_path = self._get_schema_path(xml_file)
829 if not schema_path:
830 return None, None # Skip file
831
832 try:
833 # Load schema
834 with open(schema_path, "rb") as xsd_file:
835 parser = lxml.etree.XMLParser()
836 xsd_doc = lxml.etree.parse(
837 xsd_file, parser=parser, base_url=str(schema_path)
838 )
839 schema = lxml.etree.XMLSchema(xsd_doc)
840
841 # Load and preprocess XML
842 with open(xml_file, "r") as f:
843 xml_doc = lxml.etree.parse(f)
844
845 xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
846 xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
847
848 # Clean ignorable namespaces if needed
849 relative_path = xml_file.relative_to(base_path)
850 if (
851 relative_path.parts
852 and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
853 ):
854 xml_doc = self._clean_ignorable_namespaces(xml_doc)
855
856 # Validate
857 if schema.validate(xml_doc):
858 return True, set()
859 else:
860 errors = set()
861 for error in schema.error_log:
862 # Store normalized error message (without line numbers for comparison)
863 errors.add(error.message)
864 return False, errors
865
866 except Exception as e:
867 return False, {str(e)}
868
869 def _get_original_file_errors(self, xml_file):
870 """Get XSD validation errors from a single file in the original document.
871
872 Args:
873 xml_file: Path to the XML file in unpacked_dir to check
874
875 Returns:
876 set: Set of error messages from the original file
877 """
878 import tempfile
879 import zipfile
880
881 # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
882 xml_file = Path(xml_file).resolve()
883 unpacked_dir = self.unpacked_dir.resolve()
884 relative_path = xml_file.relative_to(unpacked_dir)
885
886 with tempfile.TemporaryDirectory() as temp_dir:
887 temp_path = Path(temp_dir)
888
889 # Extract original file
890 with zipfile.ZipFile(self.original_file, "r") as zip_ref:
891 zip_ref.extractall(temp_path)
892
893 # Find corresponding file in original
894 original_xml_file = temp_path / relative_path
895
896 if not original_xml_file.exists():
897 # File didn't exist in original, so no original errors
898 return set()
899
900 # Validate the specific file in original
901 is_valid, errors = self._validate_single_file_xsd(
902 original_xml_file, temp_path
903 )
904 return errors if errors else set()
905
906 def _remove_template_tags_from_text_nodes(self, xml_doc):
907 """Remove template tags from XML text nodes and collect warnings.
908
909 Template tags follow the pattern {{ ... }} and are used as placeholders
910 for content replacement. They should be removed from text content before
911 XSD validation while preserving XML structure.
912
913 Returns:
914 tuple: (cleaned_xml_doc, warnings_list)
915 """
916 warnings = []
917 template_pattern = re.compile(r"\{\{[^}]*\}\}")
918
919 # Create a copy of the document to avoid modifying the original
920 xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
921 xml_copy = lxml.etree.fromstring(xml_string)
922
923 def process_text_content(text, content_type):
924 if not text:
925 return text
926 matches = list(template_pattern.finditer(text))
927 if matches:
928 for match in matches:
929 warnings.append(
930 f"Found template tag in {content_type}: {match.group()}"
931 )
932 return template_pattern.sub("", text)
933 return text
934
935 # Process all text nodes in the document
936 for elem in xml_copy.iter():
937 # Skip processing if this is a w:t element
938 if not hasattr(elem, "tag") or callable(elem.tag):
939 continue
940 tag_str = str(elem.tag)
941 if tag_str.endswith("}t") or tag_str == "t":
942 continue
943
944 elem.text = process_text_content(elem.text, "text content")
945 elem.tail = process_text_content(elem.tail, "tail content")
946
947 return lxml.etree.ElementTree(xml_copy), warnings
948
949
if __name__ == "__main__":
    # Guard against accidental direct execution: this module only provides
    # the BaseSchemaValidator base class for format-specific validators.
    raise RuntimeError("This module should not be run directly.")