skills/skills/docx/ooxml/scripts/pack.py at main

  1#!/usr/bin/env python3
  2"""
Tool to pack a directory into a .docx, .pptx, or .xlsx file, stripping pretty-printing whitespace and comments from its XML parts.
  4
  5Example usage:
  6    python pack.py <input_directory> <office_file> [--force]
  7"""
  8
  9import argparse
 10import shutil
 11import subprocess
 12import sys
 13import tempfile
 14import defusedxml.minidom
 15import zipfile
 16from pathlib import Path
 17
 18
 19def main():
 20    parser = argparse.ArgumentParser(description="Pack a directory into an Office file")
 21    parser.add_argument("input_directory", help="Unpacked Office document directory")
 22    parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
 23    parser.add_argument("--force", action="store_true", help="Skip validation")
 24    args = parser.parse_args()
 25
 26    try:
 27        success = pack_document(
 28            args.input_directory, args.output_file, validate=not args.force
 29        )
 30
 31        # Show warning if validation was skipped
 32        if args.force:
 33            print("Warning: Skipped validation, file may be corrupt", file=sys.stderr)
 34        # Exit with error if validation failed
 35        elif not success:
 36            print("Contents would produce a corrupt file.", file=sys.stderr)
 37            print("Please validate XML before repacking.", file=sys.stderr)
 38            print("Use --force to skip validation and pack anyway.", file=sys.stderr)
 39            sys.exit(1)
 40
 41    except ValueError as e:
 42        sys.exit(f"Error: {e}")
 43
 44
 45def pack_document(input_dir, output_file, validate=False):
 46    """Pack a directory into an Office file (.docx/.pptx/.xlsx).
 47
 48    Args:
 49        input_dir: Path to unpacked Office document directory
 50        output_file: Path to output Office file
 51        validate: If True, validates with soffice (default: False)
 52
 53    Returns:
 54        bool: True if successful, False if validation failed
 55    """
 56    input_dir = Path(input_dir)
 57    output_file = Path(output_file)
 58
 59    if not input_dir.is_dir():
 60        raise ValueError(f"{input_dir} is not a directory")
 61    if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}:
 62        raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file")
 63
 64    # Work in temporary directory to avoid modifying original
 65    with tempfile.TemporaryDirectory() as temp_dir:
 66        temp_content_dir = Path(temp_dir) / "content"
 67        shutil.copytree(input_dir, temp_content_dir)
 68
 69        # Process XML files to remove pretty-printing whitespace
 70        for pattern in ["*.xml", "*.rels"]:
 71            for xml_file in temp_content_dir.rglob(pattern):
 72                condense_xml(xml_file)
 73
 74        # Create final Office file as zip archive
 75        output_file.parent.mkdir(parents=True, exist_ok=True)
 76        with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf:
 77            for f in temp_content_dir.rglob("*"):
 78                if f.is_file():
 79                    zf.write(f, f.relative_to(temp_content_dir))
 80
 81        # Validate if requested
 82        if validate:
 83            if not validate_document(output_file):
 84                output_file.unlink()  # Delete the corrupt file
 85                return False
 86
 87    return True
 88
 89
 90def validate_document(doc_path):
 91    """Validate document by converting to HTML with soffice."""
 92    # Determine the correct filter based on file extension
 93    match doc_path.suffix.lower():
 94        case ".docx":
 95            filter_name = "html:HTML"
 96        case ".pptx":
 97            filter_name = "html:impress_html_Export"
 98        case ".xlsx":
 99            filter_name = "html:HTML (StarCalc)"
100
101    with tempfile.TemporaryDirectory() as temp_dir:
102        try:
103            result = subprocess.run(
104                [
105                    "soffice",
106                    "--headless",
107                    "--convert-to",
108                    filter_name,
109                    "--outdir",
110                    temp_dir,
111                    str(doc_path),
112                ],
113                capture_output=True,
114                timeout=10,
115                text=True,
116            )
117            if not (Path(temp_dir) / f"{doc_path.stem}.html").exists():
118                error_msg = result.stderr.strip() or "Document validation failed"
119                print(f"Validation error: {error_msg}", file=sys.stderr)
120                return False
121            return True
122        except FileNotFoundError:
123            print("Warning: soffice not found. Skipping validation.", file=sys.stderr)
124            return True
125        except subprocess.TimeoutExpired:
126            print("Validation error: Timeout during conversion", file=sys.stderr)
127            return False
128        except Exception as e:
129            print(f"Validation error: {e}", file=sys.stderr)
130            return False
131
132
def condense_xml(xml_file):
    """Rewrite an XML file in place without indentation whitespace or comments.

    Whitespace-only text nodes and comment nodes that are direct children of
    an element are removed. Elements whose whitespace is significant are left
    untouched: text elements with local name ``t`` (``w:t``, ``a:t``, and the
    unprefixed ``t`` used in spreadsheet shared strings) and any element
    marked ``xml:space="preserve"`` (for example ``w:delText``).

    Args:
        xml_file: Path to the XML file to condense. The file is overwritten
            with UTF-8 encoded output.
    """
    # Binary mode lets the parser honor the file's BOM / encoding declaration
    # instead of assuming the bytes are UTF-8.
    with open(xml_file, "rb") as f:
        dom = defusedxml.minidom.parse(f)

    for element in dom.getElementsByTagName("*"):
        # Compare the local name so prefixed and unprefixed <t> both match.
        local_name = element.tagName.rpartition(":")[2]
        if local_name == "t" or element.getAttribute("xml:space") == "preserve":
            continue

        # Iterate over a copy because children are removed during the loop.
        for child in list(element.childNodes):
            if (
                child.nodeType == child.TEXT_NODE
                and child.nodeValue
                and child.nodeValue.strip() == ""
            ) or child.nodeType == child.COMMENT_NODE:
                element.removeChild(child)

    # Write back the condensed XML
    with open(xml_file, "wb") as f:
        f.write(dom.toxml(encoding="UTF-8"))
156
157
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not parse arguments or pack anything.
if __name__ == "__main__":
    main()