skills/skills/document-skills/pptx/ooxml/scripts/validation/docx.py at main

  1"""
  2Validator for Word document XML files against XSD schemas.
  3"""
  4
  5import re
  6import tempfile
  7import zipfile
  8
  9import lxml.etree
 10
 11from .base import BaseSchemaValidator
 12
 13
 14class DOCXSchemaValidator(BaseSchemaValidator):
 15    """Validator for Word document XML files against XSD schemas."""
 16
 17    # Word-specific namespace
 18    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 19
 20    # Word-specific element to relationship type mappings
 21    # Start with empty mapping - add specific cases as we discover them
 22    ELEMENT_RELATIONSHIP_TYPES = {}
 23
 24    def validate(self):
 25        """Run all validation checks and return True if all pass."""
 26        # Test 0: XML well-formedness
 27        if not self.validate_xml():
 28            return False
 29
 30        # Test 1: Namespace declarations
 31        all_valid = True
 32        if not self.validate_namespaces():
 33            all_valid = False
 34
 35        # Test 2: Unique IDs
 36        if not self.validate_unique_ids():
 37            all_valid = False
 38
 39        # Test 3: Relationship and file reference validation
 40        if not self.validate_file_references():
 41            all_valid = False
 42
 43        # Test 4: Content type declarations
 44        if not self.validate_content_types():
 45            all_valid = False
 46
 47        # Test 5: XSD schema validation
 48        if not self.validate_against_xsd():
 49            all_valid = False
 50
 51        # Test 6: Whitespace preservation
 52        if not self.validate_whitespace_preservation():
 53            all_valid = False
 54
 55        # Test 7: Deletion validation
 56        if not self.validate_deletions():
 57            all_valid = False
 58
 59        # Test 8: Insertion validation
 60        if not self.validate_insertions():
 61            all_valid = False
 62
 63        # Test 9: Relationship ID reference validation
 64        if not self.validate_all_relationship_ids():
 65            all_valid = False
 66
 67        # Count and compare paragraphs
 68        self.compare_paragraph_counts()
 69
 70        return all_valid
 71
 72    def validate_whitespace_preservation(self):
 73        """
 74        Validate that w:t elements with whitespace have xml:space='preserve'.
 75        """
 76        errors = []
 77
 78        for xml_file in self.xml_files:
 79            # Only check document.xml files
 80            if xml_file.name != "document.xml":
 81                continue
 82
 83            try:
 84                root = lxml.etree.parse(str(xml_file)).getroot()
 85
 86                # Find all w:t elements
 87                for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
 88                    if elem.text:
 89                        text = elem.text
 90                        # Check if text starts or ends with whitespace
 91                        if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
 92                            # Check if xml:space="preserve" attribute exists
 93                            xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
 94                            if (
 95                                xml_space_attr not in elem.attrib
 96                                or elem.attrib[xml_space_attr] != "preserve"
 97                            ):
 98                                # Show a preview of the text
 99                                text_preview = (
100                                    repr(text)[:50] + "..."
101                                    if len(repr(text)) > 50
102                                    else repr(text)
103                                )
104                                errors.append(
105                                    f"  {xml_file.relative_to(self.unpacked_dir)}: "
106                                    f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
107                                )
108
109            except (lxml.etree.XMLSyntaxError, Exception) as e:
110                errors.append(
111                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
112                )
113
114        if errors:
115            print(f"FAILED - Found {len(errors)} whitespace preservation violations:")
116            for error in errors:
117                print(error)
118            return False
119        else:
120            if self.verbose:
121                print("PASSED - All whitespace is properly preserved")
122            return True
123
124    def validate_deletions(self):
125        """
126        Validate that w:t elements are not within w:del elements.
127        For some reason, XSD validation does not catch this, so we do it manually.
128        """
129        errors = []
130
131        for xml_file in self.xml_files:
132            # Only check document.xml files
133            if xml_file.name != "document.xml":
134                continue
135
136            try:
137                root = lxml.etree.parse(str(xml_file)).getroot()
138
139                # Find all w:t elements that are descendants of w:del elements
140                namespaces = {"w": self.WORD_2006_NAMESPACE}
141                xpath_expression = ".//w:del//w:t"
142                problematic_t_elements = root.xpath(
143                    xpath_expression, namespaces=namespaces
144                )
145                for t_elem in problematic_t_elements:
146                    if t_elem.text:
147                        # Show a preview of the text
148                        text_preview = (
149                            repr(t_elem.text)[:50] + "..."
150                            if len(repr(t_elem.text)) > 50
151                            else repr(t_elem.text)
152                        )
153                        errors.append(
154                            f"  {xml_file.relative_to(self.unpacked_dir)}: "
155                            f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
156                        )
157
158            except (lxml.etree.XMLSyntaxError, Exception) as e:
159                errors.append(
160                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
161                )
162
163        if errors:
164            print(f"FAILED - Found {len(errors)} deletion validation violations:")
165            for error in errors:
166                print(error)
167            return False
168        else:
169            if self.verbose:
170                print("PASSED - No w:t elements found within w:del elements")
171            return True
172
173    def count_paragraphs_in_unpacked(self):
174        """Count the number of paragraphs in the unpacked document."""
175        count = 0
176
177        for xml_file in self.xml_files:
178            # Only check document.xml files
179            if xml_file.name != "document.xml":
180                continue
181
182            try:
183                root = lxml.etree.parse(str(xml_file)).getroot()
184                # Count all w:p elements
185                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
186                count = len(paragraphs)
187            except Exception as e:
188                print(f"Error counting paragraphs in unpacked document: {e}")
189
190        return count
191
192    def count_paragraphs_in_original(self):
193        """Count the number of paragraphs in the original docx file."""
194        count = 0
195
196        try:
197            # Create temporary directory to unpack original
198            with tempfile.TemporaryDirectory() as temp_dir:
199                # Unpack original docx
200                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
201                    zip_ref.extractall(temp_dir)
202
203                # Parse document.xml
204                doc_xml_path = temp_dir + "/word/document.xml"
205                root = lxml.etree.parse(doc_xml_path).getroot()
206
207                # Count all w:p elements
208                paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
209                count = len(paragraphs)
210
211        except Exception as e:
212            print(f"Error counting paragraphs in original document: {e}")
213
214        return count
215
216    def validate_insertions(self):
217        """
218        Validate that w:delText elements are not within w:ins elements.
219        w:delText is only allowed in w:ins if nested within a w:del.
220        """
221        errors = []
222
223        for xml_file in self.xml_files:
224            if xml_file.name != "document.xml":
225                continue
226
227            try:
228                root = lxml.etree.parse(str(xml_file)).getroot()
229                namespaces = {"w": self.WORD_2006_NAMESPACE}
230
231                # Find w:delText in w:ins that are NOT within w:del
232                invalid_elements = root.xpath(
233                    ".//w:ins//w:delText[not(ancestor::w:del)]",
234                    namespaces=namespaces
235                )
236
237                for elem in invalid_elements:
238                    text_preview = (
239                        repr(elem.text or "")[:50] + "..."
240                        if len(repr(elem.text or "")) > 50
241                        else repr(elem.text or "")
242                    )
243                    errors.append(
244                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
245                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
246                    )
247
248            except (lxml.etree.XMLSyntaxError, Exception) as e:
249                errors.append(
250                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
251                )
252
253        if errors:
254            print(f"FAILED - Found {len(errors)} insertion validation violations:")
255            for error in errors:
256                print(error)
257            return False
258        else:
259            if self.verbose:
260                print("PASSED - No w:delText elements within w:ins elements")
261            return True
262
263    def compare_paragraph_counts(self):
264        """Compare paragraph counts between original and new document."""
265        original_count = self.count_paragraphs_in_original()
266        new_count = self.count_paragraphs_in_unpacked()
267
268        diff = new_count - original_count
269        diff_str = f"+{diff}" if diff > 0 else str(diff)
270        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
271
272
if __name__ == "__main__":
    # This file only defines a validator class and is meant to be imported;
    # executing it as a script is treated as a usage error.
    direct_run_message = "This module should not be run directly."
    raise RuntimeError(direct_run_message)