"""
Validator for PowerPoint presentation XML files against XSD schemas.
"""
4
5import re
6
7from .base import BaseSchemaValidator
8
9
class PPTXSchemaValidator(BaseSchemaValidator):
    """Validator for PowerPoint presentation XML files against XSD schemas.

    Adds PresentationML-specific checks (UUID-shaped IDs, slide layout
    relationship IDs, duplicate slide layout relationships, shared notes
    slides) to the generic checks that ``validate`` calls but that are not
    defined in this class (``validate_xml``, ``validate_namespaces``, ...).

    The methods below also read ``self.unpacked_dir``, ``self.xml_files``,
    ``self.verbose``, ``self.PACKAGE_RELATIONSHIPS_NAMESPACE`` and
    ``self.OFFICE_RELATIONSHIPS_NAMESPACE``; none of them is set here, so
    they are presumably provided by ``BaseSchemaValidator``.
    """

    # PowerPoint presentation namespace
    PRESENTATIONML_NAMESPACE = (
        "http://schemas.openxmlformats.org/presentationml/2006/main"
    )

    # PowerPoint-specific element to relationship type mappings.
    # Not referenced inside this class; presumably consumed by the base
    # class's relationship checks (verify against BaseSchemaValidator).
    ELEMENT_RELATIONSHIP_TYPES = {
        "sldid": "slide",
        "sldmasterid": "slidemaster",
        "notesmasterid": "notesmaster",
        "sldlayoutid": "slidelayout",
        "themeid": "theme",
        "tablestyleid": "tablestyles",
    }

    def validate(self):
        """Run all validation checks and return True if all pass.

        XML well-formedness is a hard prerequisite: when it fails, no other
        check is attempted. Every remaining check is always executed, even
        after an earlier one has failed, so one run reports every problem.

        Returns:
            bool: True only if every check passed.
        """
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,  # Test 1: Namespace declarations
            self.validate_unique_ids,  # Test 2: Unique IDs
            self.validate_uuid_ids,  # Test 3: UUID ID validation
            self.validate_file_references,  # Test 4: Relationship and file references
            self.validate_slide_layout_ids,  # Test 5: Slide layout IDs
            self.validate_content_types,  # Test 6: Content type declarations
            self.validate_against_xsd,  # Test 7: XSD schema validation
            self.validate_notes_slide_references,  # Test 8: Notes slide references
            self.validate_all_relationship_ids,  # Test 9: Relationship ID references
            self.validate_no_duplicate_slide_layouts,  # Test 10: Duplicate slide layouts
        )

        # Build a list on purpose: all() over a generator would stop at the
        # first failing check and silently skip the remaining ones.
        results = [check() for check in checks]
        return all(results)

    def validate_uuid_ids(self):
        """Validate that ID attributes that look like UUIDs contain only hex values.

        Scans every attribute whose local name ends in "id" (compared
        case-insensitively) in each file of ``self.xml_files``. A value that
        has the overall shape of a UUID (see ``_looks_like_uuid``) must match
        the 8-4-4-4-12 hex layout, with optional hyphens and optional
        surrounding braces or parentheses.

        Returns:
            bool: True if no problem was found. Otherwise the problems are
            printed and False is returned. Files that cannot be parsed are
            reported as errors as well.
        """
        import lxml.etree

        errors = []
        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
        uuid_pattern = re.compile(
            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
        )

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # Check all elements for ID attributes
                for elem in root.iter():
                    for attr, value in elem.attrib.items():
                        # Drop the "{namespace}" prefix of qualified names.
                        attr_name = attr.split("}")[-1].lower()
                        # endswith("id") also covers the plain "id" attribute.
                        if not attr_name.endswith("id"):
                            continue
                        # Only values shaped like a UUID are held to the
                        # strict hex pattern; other IDs are left alone.
                        if self._looks_like_uuid(value) and not uuid_pattern.match(
                            value
                        ):
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
                            )

            except Exception as e:
                # Deliberately broad: any failure on one file (including XML
                # syntax errors) is reported and the scan moves on.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All UUID-like IDs contain valid hex values")
        return True

    def _looks_like_uuid(self, value):
        """Check if a value has the general structure of a UUID.

        Args:
            value (str): Attribute value to inspect.

        Returns:
            bool: True if, after stripping surrounding braces/parentheses and
            removing hyphens, exactly 32 alphanumeric characters remain. The
            characters need not be valid hex digits; that is checked by the
            caller.
        """
        # Remove common UUID delimiters
        clean_value = value.strip("{}()").replace("-", "")
        # With the length pinned to 32 the string is non-empty, so
        # str.isalnum() is equivalent to testing each character.
        return len(clean_value) == 32 and clean_value.isalnum()

    def validate_slide_layout_ids(self):
        """Validate that sldLayoutId elements in slide masters reference valid slide layouts.

        For every ``ppt/slideMasters/*.xml`` file, the ``r:id`` of each
        ``sldLayoutId`` element must match the ``Id`` of a relationship in
        the master's ``_rels`` file whose ``Type`` contains "slideLayout".
        Elements without an ``r:id`` attribute are not reported here.

        Returns:
            bool: True if there are no slide masters or every reference
            resolves. Otherwise the problems are printed and False is
            returned.
        """
        import lxml.etree

        errors = []

        # Find all slide master files (sorted for a deterministic report)
        slide_masters = sorted(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))

        if not slide_masters:
            if self.verbose:
                print("PASSED - No slide masters found")
            return True

        for slide_master in slide_masters:
            try:
                # Parse the slide master file
                root = lxml.etree.parse(str(slide_master)).getroot()

                # Find the corresponding _rels file for this slide master
                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"

                if not rels_file.exists():
                    errors.append(
                        f" {slide_master.relative_to(self.unpacked_dir)}: "
                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
                    )
                    continue

                # Parse the relationships file
                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                # Relationship IDs that point to slide layouts
                valid_layout_rids = {
                    rel.get("Id")
                    for rel in rels_root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                }

                # Check every sldLayoutId element in the slide master
                for sld_layout_id in root.findall(
                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                ):
                    r_id = sld_layout_id.get(
                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
                    )
                    layout_id = sld_layout_id.get("id")

                    if r_id and r_id not in valid_layout_rids:
                        errors.append(
                            f" {slide_master.relative_to(self.unpacked_dir)}: "
                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
                            f"references r:id='{r_id}' which is not found in slide layout relationships"
                        )

            except Exception as e:
                # Deliberately broad: report the failing file and continue.
                errors.append(
                    f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
            for error in errors:
                print(error)
            print(
                "Remove invalid references or add missing slide layouts to the relationships file."
            )
            return False

        if self.verbose:
            print("PASSED - All slide layout IDs reference valid slide layouts")
        return True

    def validate_no_duplicate_slide_layouts(self):
        """Validate that no slide has more than one slideLayout relationship.

        Inspects every ``ppt/slides/_rels/*.xml.rels`` file and counts the
        relationships whose ``Type`` contains "slideLayout". Only counts
        above one are reported; a slide with no slideLayout relationship is
        not flagged by this check.

        Returns:
            bool: True if no slide has duplicate layout relationships.
            Otherwise the problems are printed and False is returned.
        """
        import lxml.etree

        errors = []
        # Sorted for a deterministic report
        slide_rels_files = sorted(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all slideLayout relationships
                layout_rels = [
                    rel
                    for rel in root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                ]

                if len(layout_rels) > 1:
                    errors.append(
                        f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
                    )

            except Exception as e:
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print("FAILED - Found slides with duplicate slideLayout references:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All slides have exactly one slideLayout reference")
        return True

    def validate_notes_slide_references(self):
        """Validate that each notesSlide file is referenced by only one slide.

        Collects, from every ``ppt/slides/_rels/*.xml.rels`` file, the
        targets of relationships whose ``Type`` contains "notesSlide" and
        reports any target that is referenced by more than one slide.

        Returns:
            bool: True if there are no slide relationship files or every
            notes slide is referenced at most once. Otherwise the problems
            are printed and False is returned.
        """
        import lxml.etree

        errors = []
        # Number of distinct problems. Kept separately from ``errors``
        # because each duplicate also appends one detail line per
        # referencing file, and those lines must not inflate the count.
        issue_count = 0
        notes_slide_references = {}  # Track which slides reference each notesSlide

        # Find all slide relationship files (sorted for a deterministic report)
        slide_rels_files = sorted(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        if not slide_rels_files:
            if self.verbose:
                print("PASSED - No slide relationship files found")
            return True

        for rels_file in slide_rels_files:
            try:
                # Parse the relationships file
                root = lxml.etree.parse(str(rels_file)).getroot()

                # Find all notesSlide relationships
                for rel in root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    if "notesSlide" not in rel.get("Type", ""):
                        continue
                    target = rel.get("Target", "")
                    if not target:
                        continue

                    # Normalize the target path to handle relative paths
                    normalized_target = target.replace("../", "")

                    # "slide1.xml.rels" -> stem "slide1.xml" -> "slide1"
                    slide_name = rels_file.stem.replace(".xml", "")

                    notes_slide_references.setdefault(normalized_target, []).append(
                        (slide_name, rels_file)
                    )

            except Exception as e:
                # Deliberately broad: report the failing file and continue.
                issue_count += 1
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        # Check for duplicate references
        for target, references in notes_slide_references.items():
            if len(references) > 1:
                issue_count += 1
                slide_names = [name for name, _ in references]
                errors.append(
                    f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
                )
                errors.extend(
                    f" - {ref_file.relative_to(self.unpacked_dir)}"
                    for _, ref_file in references
                )

        if errors:
            print(
                f"FAILED - Found {issue_count} notes slide reference validation errors:"
            )
            for error in errors:
                print(error)
            print("Each slide may optionally have its own notes slide file.")
            return False

        if self.verbose:
            print("PASSED - All notes slide references are unique")
        return True
312
313
# This module relies on a package-relative import and is meant to be imported
# only; refuse to run as a script.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")