1"""
  2Validator for PowerPoint presentation XML files against XSD schemas.
  3"""
  4
  5import re
  6
  7from .base import BaseSchemaValidator
  8
  9
 10class PPTXSchemaValidator(BaseSchemaValidator):
 11    """Validator for PowerPoint presentation XML files against XSD schemas."""
 12
 13    # PowerPoint presentation namespace
 14    PRESENTATIONML_NAMESPACE = (
 15        "http://schemas.openxmlformats.org/presentationml/2006/main"
 16    )
 17
 18    # PowerPoint-specific element to relationship type mappings
 19    ELEMENT_RELATIONSHIP_TYPES = {
 20        "sldid": "slide",
 21        "sldmasterid": "slidemaster",
 22        "notesmasterid": "notesmaster",
 23        "sldlayoutid": "slidelayout",
 24        "themeid": "theme",
 25        "tablestyleid": "tablestyles",
 26    }
 27
 28    def validate(self):
 29        """Run all validation checks and return True if all pass."""
 30        # Test 0: XML well-formedness
 31        if not self.validate_xml():
 32            return False
 33
 34        # Test 1: Namespace declarations
 35        all_valid = True
 36        if not self.validate_namespaces():
 37            all_valid = False
 38
 39        # Test 2: Unique IDs
 40        if not self.validate_unique_ids():
 41            all_valid = False
 42
 43        # Test 3: UUID ID validation
 44        if not self.validate_uuid_ids():
 45            all_valid = False
 46
 47        # Test 4: Relationship and file reference validation
 48        if not self.validate_file_references():
 49            all_valid = False
 50
 51        # Test 5: Slide layout ID validation
 52        if not self.validate_slide_layout_ids():
 53            all_valid = False
 54
 55        # Test 6: Content type declarations
 56        if not self.validate_content_types():
 57            all_valid = False
 58
 59        # Test 7: XSD schema validation
 60        if not self.validate_against_xsd():
 61            all_valid = False
 62
 63        # Test 8: Notes slide reference validation
 64        if not self.validate_notes_slide_references():
 65            all_valid = False
 66
 67        # Test 9: Relationship ID reference validation
 68        if not self.validate_all_relationship_ids():
 69            all_valid = False
 70
 71        # Test 10: Duplicate slide layout references validation
 72        if not self.validate_no_duplicate_slide_layouts():
 73            all_valid = False
 74
 75        return all_valid
 76
 77    def validate_uuid_ids(self):
 78        """Validate that ID attributes that look like UUIDs contain only hex values."""
 79        import lxml.etree
 80
 81        errors = []
 82        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
 83        uuid_pattern = re.compile(
 84            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
 85        )
 86
 87        for xml_file in self.xml_files:
 88            try:
 89                root = lxml.etree.parse(str(xml_file)).getroot()
 90
 91                # Check all elements for ID attributes
 92                for elem in root.iter():
 93                    for attr, value in elem.attrib.items():
 94                        # Check if this is an ID attribute
 95                        attr_name = attr.split("}")[-1].lower()
 96                        if attr_name == "id" or attr_name.endswith("id"):
 97                            # Check if value looks like a UUID (has the right length and pattern structure)
 98                            if self._looks_like_uuid(value):
 99                                # Validate that it contains only hex characters in the right positions
100                                if not uuid_pattern.match(value):
101                                    errors.append(
102                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
103                                        f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
104                                    )
105
106            except (lxml.etree.XMLSyntaxError, Exception) as e:
107                errors.append(
108                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
109                )
110
111        if errors:
112            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
113            for error in errors:
114                print(error)
115            return False
116        else:
117            if self.verbose:
118                print("PASSED - All UUID-like IDs contain valid hex values")
119            return True
120
121    def _looks_like_uuid(self, value):
122        """Check if a value has the general structure of a UUID."""
123        # Remove common UUID delimiters
124        clean_value = value.strip("{}()").replace("-", "")
125        # Check if it's 32 hex-like characters (could include invalid hex chars)
126        return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
127
128    def validate_slide_layout_ids(self):
129        """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
130        import lxml.etree
131
132        errors = []
133
134        # Find all slide master files
135        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
136
137        if not slide_masters:
138            if self.verbose:
139                print("PASSED - No slide masters found")
140            return True
141
142        for slide_master in slide_masters:
143            try:
144                # Parse the slide master file
145                root = lxml.etree.parse(str(slide_master)).getroot()
146
147                # Find the corresponding _rels file for this slide master
148                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
149
150                if not rels_file.exists():
151                    errors.append(
152                        f"  {slide_master.relative_to(self.unpacked_dir)}: "
153                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
154                    )
155                    continue
156
157                # Parse the relationships file
158                rels_root = lxml.etree.parse(str(rels_file)).getroot()
159
160                # Build a set of valid relationship IDs that point to slide layouts
161                valid_layout_rids = set()
162                for rel in rels_root.findall(
163                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
164                ):
165                    rel_type = rel.get("Type", "")
166                    if "slideLayout" in rel_type:
167                        valid_layout_rids.add(rel.get("Id"))
168
169                # Find all sldLayoutId elements in the slide master
170                for sld_layout_id in root.findall(
171                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
172                ):
173                    r_id = sld_layout_id.get(
174                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
175                    )
176                    layout_id = sld_layout_id.get("id")
177
178                    if r_id and r_id not in valid_layout_rids:
179                        errors.append(
180                            f"  {slide_master.relative_to(self.unpacked_dir)}: "
181                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
182                            f"references r:id='{r_id}' which is not found in slide layout relationships"
183                        )
184
185            except (lxml.etree.XMLSyntaxError, Exception) as e:
186                errors.append(
187                    f"  {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
188                )
189
190        if errors:
191            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
192            for error in errors:
193                print(error)
194            print(
195                "Remove invalid references or add missing slide layouts to the relationships file."
196            )
197            return False
198        else:
199            if self.verbose:
200                print("PASSED - All slide layout IDs reference valid slide layouts")
201            return True
202
203    def validate_no_duplicate_slide_layouts(self):
204        """Validate that each slide has exactly one slideLayout reference."""
205        import lxml.etree
206
207        errors = []
208        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
209
210        for rels_file in slide_rels_files:
211            try:
212                root = lxml.etree.parse(str(rels_file)).getroot()
213
214                # Find all slideLayout relationships
215                layout_rels = [
216                    rel
217                    for rel in root.findall(
218                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
219                    )
220                    if "slideLayout" in rel.get("Type", "")
221                ]
222
223                if len(layout_rels) > 1:
224                    errors.append(
225                        f"  {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
226                    )
227
228            except Exception as e:
229                errors.append(
230                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
231                )
232
233        if errors:
234            print("FAILED - Found slides with duplicate slideLayout references:")
235            for error in errors:
236                print(error)
237            return False
238        else:
239            if self.verbose:
240                print("PASSED - All slides have exactly one slideLayout reference")
241            return True
242
243    def validate_notes_slide_references(self):
244        """Validate that each notesSlide file is referenced by only one slide."""
245        import lxml.etree
246
247        errors = []
248        notes_slide_references = {}  # Track which slides reference each notesSlide
249
250        # Find all slide relationship files
251        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
252
253        if not slide_rels_files:
254            if self.verbose:
255                print("PASSED - No slide relationship files found")
256            return True
257
258        for rels_file in slide_rels_files:
259            try:
260                # Parse the relationships file
261                root = lxml.etree.parse(str(rels_file)).getroot()
262
263                # Find all notesSlide relationships
264                for rel in root.findall(
265                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
266                ):
267                    rel_type = rel.get("Type", "")
268                    if "notesSlide" in rel_type:
269                        target = rel.get("Target", "")
270                        if target:
271                            # Normalize the target path to handle relative paths
272                            normalized_target = target.replace("../", "")
273
274                            # Track which slide references this notesSlide
275                            slide_name = rels_file.stem.replace(
276                                ".xml", ""
277                            )  # e.g., "slide1"
278
279                            if normalized_target not in notes_slide_references:
280                                notes_slide_references[normalized_target] = []
281                            notes_slide_references[normalized_target].append(
282                                (slide_name, rels_file)
283                            )
284
285            except (lxml.etree.XMLSyntaxError, Exception) as e:
286                errors.append(
287                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
288                )
289
290        # Check for duplicate references
291        for target, references in notes_slide_references.items():
292            if len(references) > 1:
293                slide_names = [ref[0] for ref in references]
294                errors.append(
295                    f"  Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
296                )
297                for slide_name, rels_file in references:
298                    errors.append(f"    - {rels_file.relative_to(self.unpacked_dir)}")
299
300        if errors:
301            print(
302                f"FAILED - Found {len([e for e in errors if not e.startswith('    ')])} notes slide reference validation errors:"
303            )
304            for error in errors:
305                print(error)
306            print("Each slide may optionally have its own slide file.")
307            return False
308        else:
309            if self.verbose:
310                print("PASSED - All notes slide references are unique")
311            return True
312
313
# This module uses a package-relative import (``from .base import ...``), so
# it is meant to be imported as part of its package; fail loudly if it is
# executed as a script instead.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")