#!/usr/bin/env python3
"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file, removing the
pretty-printing whitespace from its XML parts.

Example usage:
    python pack.py <input_directory> <office_file> [--force]
"""
8
import argparse
import shutil
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path

import defusedxml.minidom
17
18
def main():
    """Command-line entry point: parse arguments and pack the directory.

    Validation is requested from ``pack_document`` unless ``--force`` is given.
    With ``--force`` a warning is printed to stderr after packing. Without it,
    a failed validation prints guidance to stderr and exits with status 1.
    A ``ValueError`` raised while packing ends the program with an
    ``Error: ...`` message.
    """
    cli = argparse.ArgumentParser(description="Pack a directory into an Office file")
    cli.add_argument("input_directory", help="Unpacked Office document directory")
    cli.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    cli.add_argument("--force", action="store_true", help="Skip validation")
    options = cli.parse_args()
    skip_validation = options.force

    try:
        packed_ok = pack_document(
            options.input_directory,
            options.output_file,
            validate=not skip_validation,
        )

        if skip_validation:
            # Nothing was checked, so the result cannot be vouched for.
            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
            return
        if packed_ok:
            return

        # Validation ran and rejected the document.
        for message in (
            "Contents would produce a corrupt file.",
            "Please validate XML before repacking.",
            "Use --force to skip validation and pack anyway.",
        ):
            print(message, file=sys.stderr)
        sys.exit(1)

    except ValueError as err:
        sys.exit(f"Error: {err}")
43
44
def pack_document(input_dir, output_file, validate=False):
    """Pack a directory into an Office file (.docx/.pptx/.xlsx).

    The directory is copied to a scratch location, every ``*.xml`` / ``*.rels``
    file in the copy is passed through ``condense_xml``, and the copy is
    zipped. The archive is built (and, if requested, validated) inside the
    scratch directory and is moved to ``output_file`` only once it is known to
    be acceptable, so a failed run never overwrites or deletes a file that
    already exists at ``output_file``.

    Args:
        input_dir: Path to unpacked Office document directory
        output_file: Path to output Office file
        validate: If True, validates with soffice (default: False)

    Returns:
        bool: True if successful, False if validation failed

    Raises:
        ValueError: If ``input_dir`` is not a directory, or ``output_file``
            does not end in .docx, .pptx, or .xlsx.
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    if not input_dir.is_dir():
        raise ValueError(f"{input_dir} is not a directory")
    if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
        raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")

    # Work in temporary directory to avoid modifying original
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_content_dir = Path(temp_dir) / "content"
        shutil.copytree(input_dir, temp_content_dir)

        # Process XML files to remove pretty-printing whitespace
        for pattern in ("*.xml", "*.rels"):
            for xml_file in temp_content_dir.rglob(pattern):
                # rglob also yields directories whose name matches the pattern.
                if xml_file.is_file():
                    condense_xml(xml_file)

        # Stage the archive under its final name so that validation sees the
        # right suffix, but keep it out of the destination until it is good.
        staged_file = Path(temp_dir) / output_file.name

        # Sort the members so the archive layout does not depend on the order
        # in which the filesystem happens to list directory entries.
        members = sorted(p for p in temp_content_dir.rglob("*") if p.is_file())
        with zipfile.ZipFile(staged_file, "w", zipfile.ZIP_DEFLATED) as zf:
            for member in members:
                zf.write(member, member.relative_to(temp_content_dir).as_posix())

        # Validate if requested; a rejected archive is discarded together
        # with the scratch directory.
        if validate and not validate_document(staged_file):
            return False

        output_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(staged_file), str(output_file))

    return True
88
89
def validate_document(doc_path, timeout=10):
    """Validate document by converting to HTML with soffice.

    The document is considered valid when the conversion leaves a
    ``<stem>.html`` file in a scratch output directory.

    Args:
        doc_path: Path (``str`` or ``Path``) of the .docx/.pptx/.xlsx file.
        timeout: Seconds to wait for soffice before giving up (default: 10).

    Returns:
        bool: True if the HTML file was produced, or if soffice is not
        installed (validation is then skipped with a warning). False if the
        file type is unsupported, the conversion timed out, soffice could not
        be run, or no HTML file was produced.
    """
    doc_path = Path(doc_path)

    # Determine the correct filter based on file extension
    soffice_filters = {
        ".docx": "html:HTML",
        ".pptx": "html:impress_html_Export",
        ".xlsx": "html:HTML (StarCalc)",
    }
    filter_name = soffice_filters.get(doc_path.suffix.lower())
    if filter_name is None:
        # Report the real problem instead of failing later on an unset filter.
        print(
            f"Validation error: Unsupported file type '{doc_path.suffix}'",
            file=sys.stderr,
        )
        return False

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    filter_name,
                    "--outdir",
                    temp_dir,
                    str(doc_path),
                ],
                capture_output=True,
                timeout=timeout,
                text=True,
            )
        except FileNotFoundError:
            # Without soffice there is nothing to check against; do not block.
            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
            return True
        except subprocess.TimeoutExpired:
            print("Validation error: Timeout during conversion", file=sys.stderr)
            return False
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            # Launch failures and undecodable output (UnicodeDecodeError is a
            # ValueError) count as a failed validation, not a crash.
            print(f"Validation error: {e}", file=sys.stderr)
            return False

        # Success is judged by the presence of the output file, not by the
        # process exit status.
        if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
            error_msg = result.stderr.strip() or "Document validation failed"
            print(f"Validation error: {error_msg}", file=sys.stderr)
            return False
        return True
131
132
def condense_xml(xml_file):
    """Strip unnecessary whitespace and remove comments, rewriting the file in place.

    Whitespace-only text nodes and comment nodes are removed from every
    element, except elements whose content must be kept verbatim:

    * elements whose local name is ``t`` (``w:t``, ``a:t``, or an unprefixed
      ``t`` in a default namespace), and
    * elements that carry ``xml:space="preserve"`` themselves.

    Args:
        xml_file: Path of the XML file to condense. It is overwritten with
            the condensed document, serialized as UTF-8.
    """
    # Read as bytes so the parser can honour the file's own BOM / encoding
    # declaration instead of assuming the bytes on disk are UTF-8.
    with open(xml_file, "rb") as f:
        dom = defusedxml.minidom.parse(f)

    # Process each element to remove whitespace and comments
    for element in dom.getElementsByTagName("*"):
        # Text-run elements: whitespace inside them is document content.
        if element.tagName.rpartition(":")[2] == "t":
            continue
        # The element explicitly asks for its whitespace to be kept. Only the
        # element's own attribute is consulted, not inherited values.
        if element.getAttribute("xml:space") == "preserve":
            continue

        # Remove whitespace-only text nodes and comment nodes. Iterate over a
        # copy because childNodes is modified while looping.
        for child in list(element.childNodes):
            if (
                child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            ) or child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))
156
157
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()