main
1"""
2Validator for Word document XML files against XSD schemas.
3"""
4
5import re
6import tempfile
7import zipfile
8
9import lxml.etree
10
11from .base import BaseSchemaValidator
12
13
class DOCXSchemaValidator(BaseSchemaValidator):
    """Validator for Word document XML files against XSD schemas.

    On top of the generic checks invoked from :meth:`validate`, this class
    implements Word-specific checks on ``document.xml``: whitespace
    preservation on ``w:t`` and the placement of text inside tracked
    deletions/insertions.

    The methods below read ``self.xml_files``, ``self.unpacked_dir``,
    ``self.original_file``, ``self.verbose`` and ``self.XML_NAMESPACE``; none
    of these are defined in this class (presumably they are provided by
    ``BaseSchemaValidator`` - confirm against the base class).
    """

    # Word-specific namespace
    WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    # Word-specific element to relationship type mappings
    # Start with empty mapping - add specific cases as we discover them
    ELEMENT_RELATIONSHIP_TYPES = {}

    def validate(self):
        """Run all validation checks and return True if all pass.

        XML well-formedness is checked first; when it fails the method returns
        False immediately and no other check runs. Otherwise every remaining
        check is executed regardless of earlier failures, so one run reports
        all problems, and the paragraph-count comparison is printed last.

        Returns:
            bool: True only if every check returned a truthy value.
        """
        # Test 0: XML well-formedness
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,  # Test 1: Namespace declarations
            self.validate_unique_ids,  # Test 2: Unique IDs
            self.validate_file_references,  # Test 3: Relationships / file refs
            self.validate_content_types,  # Test 4: Content type declarations
            self.validate_against_xsd,  # Test 5: XSD schema validation
            self.validate_whitespace_preservation,  # Test 6
            self.validate_deletions,  # Test 7: Deletion validation
            self.validate_insertions,  # Test 8: Insertion validation
            self.validate_all_relationship_ids,  # Test 9: Relationship IDs
        )
        # Build a list (rather than feeding a generator to all()) so that no
        # check is skipped after an earlier one has failed.
        outcomes = [bool(check()) for check in checks]

        # Count and compare paragraphs (informational only; does not affect
        # the result).
        self.compare_paragraph_counts()

        return all(outcomes)

    @staticmethod
    def _text_preview(text, limit=50):
        """Return ``repr(text)``, cut to ``limit`` characters plus ``...`` if longer."""
        shown = repr(text)
        if len(shown) > limit:
            return shown[:limit] + "..."
        return shown

    def _document_xml_files(self):
        """Return an iterator over the entries of ``self.xml_files`` named ``document.xml``."""
        return (
            xml_file for xml_file in self.xml_files if xml_file.name == "document.xml"
        )

    def _report_violations(self, errors, kind, success_message):
        """Print the outcome of one check and return True when ``errors`` is empty.

        Args:
            errors: Pre-formatted violation messages, one per line to print.
            kind: Label inserted into the ``FAILED`` header line.
            success_message: Text printed after ``PASSED - `` in verbose mode.
        """
        if errors:
            print(f"FAILED - Found {len(errors)} {kind} violations:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print(f"PASSED - {success_message}")
        return True

    def validate_whitespace_preservation(self):
        """
        Validate that w:t elements with whitespace have xml:space='preserve'.

        Only files named ``document.xml`` are inspected. A ``w:t`` element is
        reported when its text starts or ends with whitespace and its
        ``xml:space`` attribute is missing or has a value other than
        ``preserve``. Any exception raised while processing a file is recorded
        as a violation for that file instead of propagating.

        Returns:
            bool: True if nothing was recorded; otherwise False, after
            printing every recorded message.
        """
        errors = []
        text_tag = f"{{{self.WORD_2006_NAMESPACE}}}t"

        for xml_file in self._document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                for elem in root.iter(text_tag):
                    text = elem.text
                    # strip() removes exactly the leading/trailing characters
                    # that count as whitespace, and unlike a ".*\s$" regex it
                    # is not defeated by line breaks inside the text.
                    if not text or text == text.strip():
                        continue
                    if elem.get(f"{{{self.XML_NAMESPACE}}}space") == "preserve":
                        continue

                    text_preview = self._text_preview(text)
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}"
                    )

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_violations(
            errors,
            "whitespace preservation",
            "All whitespace is properly preserved",
        )

    def validate_deletions(self):
        """
        Validate that w:t elements are not within w:del elements.
        For some reason, XSD validation does not catch this, so we do it manually.

        Only files named ``document.xml`` are inspected, and only ``w:t``
        descendants of ``w:del`` that carry non-empty text are reported. Any
        exception raised while processing a file is recorded as a violation
        for that file instead of propagating.

        Returns:
            bool: True if nothing was recorded; otherwise False, after
            printing every recorded message.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # All w:t elements that are descendants of a w:del element
                for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
                    if not t_elem.text:
                        continue

                    text_preview = self._text_preview(t_elem.text)
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
                    )

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_violations(
            errors,
            "deletion validation",
            "No w:t elements found within w:del elements",
        )

    def count_paragraphs_in_unpacked(self):
        """Count the number of paragraphs in the unpacked document.

        Counts ``w:p`` elements in files named ``document.xml``. If several
        such files are present, the last one parsed successfully determines
        the result (counts are not summed). Parse failures are printed and
        otherwise ignored.

        Returns:
            int: The paragraph count, or 0 if no file could be counted.
        """
        count = 0
        paragraph_path = f".//{{{self.WORD_2006_NAMESPACE}}}p"

        for xml_file in self._document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()
                count = len(root.findall(paragraph_path))
            except Exception as e:
                print(f"Error counting paragraphs in unpacked document: {e}")

        return count

    def count_paragraphs_in_original(self):
        """Count the number of paragraphs in the original docx file.

        Reads ``word/document.xml`` directly from the archive at
        ``self.original_file`` (nothing is extracted to disk) and counts its
        ``w:p`` elements. Any failure is printed and otherwise ignored.

        Returns:
            int: The paragraph count, or 0 if the document could not be read.
        """
        count = 0

        try:
            with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                # lxml parses from a file-like object, so the member can be
                # read straight from the zip without unpacking the archive.
                with zip_ref.open("word/document.xml") as doc_xml:
                    root = lxml.etree.parse(doc_xml).getroot()

            # Count all w:p elements
            count = len(root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p"))

        except Exception as e:
            print(f"Error counting paragraphs in original document: {e}")

        return count

    def validate_insertions(self):
        """
        Validate that w:delText elements are not within w:ins elements.
        w:delText is only allowed in w:ins if nested within a w:del.

        Only files named ``document.xml`` are inspected. Any exception raised
        while processing a file is recorded as a violation for that file
        instead of propagating.

        Returns:
            bool: True if nothing was recorded; otherwise False, after
            printing every recorded message.
        """
        errors = []
        namespaces = {"w": self.WORD_2006_NAMESPACE}

        for xml_file in self._document_xml_files():
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                # w:delText under w:ins that has no w:del ancestor at all
                invalid_elements = root.xpath(
                    ".//w:ins//w:delText[not(ancestor::w:del)]",
                    namespaces=namespaces,
                )

                for elem in invalid_elements:
                    # elem.text is None for an empty element; show '' then.
                    text_preview = self._text_preview(elem.text or "")
                    errors.append(
                        f" {xml_file.relative_to(self.unpacked_dir)}: "
                        f"Line {elem.sourceline}: <w:delText> within <w:ins>: {text_preview}"
                    )

            except Exception as e:
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        return self._report_violations(
            errors,
            "insertion validation",
            "No w:delText elements within w:ins elements",
        )

    def compare_paragraph_counts(self):
        """Compare paragraph counts between original and new document.

        Prints one line of the form ``Paragraphs: <original> → <new> (<diff>)``
        where a positive difference is prefixed with ``+``. Nothing is
        returned.
        """
        original_count = self.count_paragraphs_in_original()
        new_count = self.count_paragraphs_in_unpacked()

        diff = new_count - original_count
        # Only strictly positive differences get an explicit sign; zero is
        # printed as a plain "0".
        diff_str = f"+{diff}" if diff > 0 else str(diff)
        print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
271
272
# This file only provides definitions for import; executing it as a script is
# rejected explicitly.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")