main
1#!/usr/bin/env python3
2"""
3Extract structured text content from PowerPoint presentations.
4
5This module provides functionality to:
6- Extract all text content from PowerPoint shapes
7- Preserve paragraph formatting (alignment, bullets, fonts, spacing)
8- Handle nested GroupShapes recursively with correct absolute positions
9- Sort shapes by visual position on slides
10- Filter out slide numbers and non-content placeholders
11- Export to JSON with clean, structured data
12
13Classes:
14 ParagraphData: Represents a text paragraph with formatting
15 ShapeData: Represents a shape with position and text content
16
17Main Functions:
18 extract_text_inventory: Extract all text from a presentation
19 save_inventory: Save extracted data to JSON
20
21Usage:
22 python inventory.py input.pptx output.json
23"""
24
25import argparse
26import json
27import platform
28import sys
29from dataclasses import dataclass
30from pathlib import Path
31from typing import Any, Dict, List, Optional, Tuple, Union
32
33from PIL import Image, ImageDraw, ImageFont
34from pptx import Presentation
35from pptx.enum.text import PP_ALIGN
36from pptx.shapes.base import BaseShape
37
# Type aliases for cleaner signatures
# A JSON scalar; the value type of every field in a serialized paragraph.
JsonValue = Union[str, int, float, bool, None]
# One serialized paragraph: field name -> scalar value.
ParagraphDict = Dict[str, JsonValue]
# One serialized shape: geometry numbers, optional metadata (nested dicts,
# warning strings) and the list of serialized paragraphs.
ShapeDict = Dict[
    str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None]
]
InventoryData = Dict[
    str, Dict[str, "ShapeData"]
]  # Dict of slide_id -> {shape_id -> ShapeData}
InventoryDict = Dict[str, Dict[str, ShapeDict]]  # JSON-serializable inventory
48
49
def main():
    """Command-line entry point: parse arguments, extract, save and report.

    Exits with status 1 when the input path does not exist, does not end in
    ``.pptx``, or when extraction or saving raises an exception (the
    traceback is printed before exiting).
    """
    cli = argparse.ArgumentParser(
        description="Extract text inventory from PowerPoint with proper GroupShape support.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python inventory.py presentation.pptx inventory.json
    Extracts text inventory with correct absolute positions for grouped shapes

  python inventory.py presentation.pptx inventory.json --issues-only
    Extracts only text shapes that have overflow or overlap issues

The output JSON includes:
  - All text content organized by slide and shape
  - Correct absolute positions for shapes in groups
  - Visual position and size in inches
  - Paragraph properties and formatting
  - Issue detection: text overflow and shape overlaps
        """,
    )
    cli.add_argument("input", help="Input PowerPoint file (.pptx)")
    cli.add_argument("output", help="Output JSON file for inventory")
    cli.add_argument(
        "--issues-only",
        action="store_true",
        help="Include only text shapes that have overflow or overlap issues",
    )
    args = cli.parse_args()
    issues_only = args.issues_only

    # Validate the input before doing any work.
    source = Path(args.input)
    if not source.exists():
        print(f"Error: Input file not found: {args.input}")
        sys.exit(1)
    if source.suffix.lower() != ".pptx":
        print("Error: Input must be a PowerPoint file (.pptx)")
        sys.exit(1)

    try:
        print(f"Extracting text inventory from: {args.input}")
        if issues_only:
            print(
                "Filtering to include only text shapes with issues (overflow/overlap)"
            )
        inventory = extract_text_inventory(source, issues_only=issues_only)

        # Create the destination directory on demand, then write the JSON.
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        save_inventory(inventory, destination)
        print(f"Output saved to: {args.output}")

        # Summary statistics
        slide_count = len(inventory)
        shape_count = sum(len(slide_shapes) for slide_shapes in inventory.values())
        if not issues_only:
            print(
                f"Found text in {slide_count} slides with {shape_count} text elements"
            )
        elif shape_count > 0:
            print(
                f"Found {shape_count} text elements with issues in {slide_count} slides"
            )
        else:
            print("No issues discovered")

    except Exception as e:
        print(f"Error processing presentation: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
126
127
@dataclass
class ShapeWithPosition:
    """Pairs a shape with the slide-level offset computed for it.

    The offsets are the shape's own left/top plus the accumulated left/top
    of every enclosing group, as produced by
    ``collect_shapes_with_absolute_positions``.
    """

    shape: BaseShape
    absolute_left: int  # EMUs, including parent-group offsets
    absolute_top: int  # EMUs, including parent-group offsets
135
136
class ParagraphData:
    """Text and formatting captured from one PowerPoint paragraph.

    Formatting is read from the paragraph object and, for font properties,
    from its first run only. Every optional attribute stays ``None``
    (``bullet`` stays ``False``) when the corresponding value is not set.
    """

    def __init__(self, paragraph: Any):
        """Initialize from a PowerPoint paragraph object.

        Args:
            paragraph: The PowerPoint paragraph object
        """
        self.text: str = paragraph.text.strip()
        self.bullet: bool = False
        self.level: Optional[int] = None
        self.alignment: Optional[str] = None
        self.space_before: Optional[float] = None
        self.space_after: Optional[float] = None
        self.font_name: Optional[str] = None
        self.font_size: Optional[float] = None
        self.bold: Optional[bool] = None
        self.italic: Optional[bool] = None
        self.underline: Optional[bool] = None
        self.color: Optional[str] = None
        self.theme_color: Optional[str] = None
        self.line_spacing: Optional[float] = None

        # Bullets: a character or auto-number bullet element in the paragraph
        # properties marks the paragraph as bulleted. The level is recorded
        # only for bulleted paragraphs.
        if (
            hasattr(paragraph, "_p")
            and paragraph._p is not None
            and paragraph._p.pPr is not None
        ):
            pPr = paragraph._p.pPr
            ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
            if (
                pPr.find(f"{ns}buChar") is not None
                or pPr.find(f"{ns}buAutoNum") is not None
            ):
                self.bullet = True
                if hasattr(paragraph, "level"):
                    self.level = paragraph.level

        # Alignment is recorded only for CENTER / RIGHT / JUSTIFY; anything
        # else (including LEFT) is left as None.
        if hasattr(paragraph, "alignment") and paragraph.alignment is not None:
            alignment_map = {
                PP_ALIGN.CENTER: "CENTER",
                PP_ALIGN.RIGHT: "RIGHT",
                PP_ALIGN.JUSTIFY: "JUSTIFY",
            }
            if paragraph.alignment in alignment_map:
                self.alignment = alignment_map[paragraph.alignment]

        # Spacing before/after, in points (a zero or missing value is skipped)
        if hasattr(paragraph, "space_before") and paragraph.space_before:
            self.space_before = paragraph.space_before.pt
        if hasattr(paragraph, "space_after") and paragraph.space_after:
            self.space_after = paragraph.space_after.pt

        # Font properties come from the first run only
        if paragraph.runs:
            first_run = paragraph.runs[0]
            if hasattr(first_run, "font"):
                font = first_run.font
                if font.name:
                    self.font_name = font.name
                if font.size:
                    self.font_size = font.size.pt
                if font.bold is not None:
                    self.bold = font.bold
                if font.italic is not None:
                    self.italic = font.italic
                if font.underline is not None:
                    self.underline = font.underline
                self._read_color(font)

        # Line spacing: a value exposing ``.pt`` is a fixed length; anything
        # else is treated as a multiplier of the font size (12pt when the
        # paragraph has no explicit size).
        if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None:
            if hasattr(paragraph.line_spacing, "pt"):
                self.line_spacing = round(paragraph.line_spacing.pt, 2)
            else:
                font_size = self.font_size if self.font_size else 12.0
                self.line_spacing = round(paragraph.line_spacing * font_size, 2)

    def _read_color(self, font: Any) -> None:
        """Record the run's explicit RGB or theme colour without altering it.

        python-pptx's ``Font.color`` converts the run's fill to a solid fill
        when it is not one already, which edits the underlying XML. The
        colour is therefore read only when a solid fill is already present,
        so extracting an inventory never modifies a presentation the caller
        may save afterwards.
        """
        fill_type = getattr(getattr(font, "fill", None), "type", None)
        if fill_type is None:
            # No explicit fill on the run: nothing to report, nothing touched.
            return

        # Imported lazily so the enum is only needed when a fill is present.
        from pptx.enum.dml import MSO_FILL

        if fill_type != MSO_FILL.SOLID:
            return

        color = font.color
        try:
            # Try RGB color first
            if color.rgb:
                self.color = str(color.rgb)
        except (AttributeError, TypeError):
            # Not an RGB colour: fall back to theme color
            try:
                if color.theme_color:
                    self.theme_color = color.theme_color.name
            except (AttributeError, TypeError):
                pass

    def to_dict(self) -> "ParagraphDict":
        """Convert to dictionary for JSON serialization, excluding unset values.

        Returns:
            A dict that always contains ``"text"``; every other key appears
            only when the corresponding attribute has a value.
        """
        result: "ParagraphDict" = {"text": self.text}

        # Add optional fields only if they have values
        if self.bullet:
            result["bullet"] = self.bullet
            if self.level is not None:
                result["level"] = self.level
        if self.alignment:
            result["alignment"] = self.alignment
        if self.space_before is not None:
            result["space_before"] = self.space_before
        if self.space_after is not None:
            result["space_after"] = self.space_after
        if self.font_name:
            result["font_name"] = self.font_name
        if self.font_size is not None:
            result["font_size"] = self.font_size
        if self.bold is not None:
            result["bold"] = self.bold
        if self.italic is not None:
            result["italic"] = self.italic
        if self.underline is not None:
            result["underline"] = self.underline
        if self.color:
            result["color"] = self.color
        if self.theme_color:
            result["theme_color"] = self.theme_color
        if self.line_spacing is not None:
            result["line_spacing"] = self.line_spacing

        return result
264
265
class ShapeData:
    """Position, size, text and detected layout issues of one text shape.

    Geometry is exposed both in inches (rounded to two decimals, used for
    output) and in EMUs (used for slide-overflow arithmetic). Overflow and
    bullet warnings are computed during construction; ``overlapping_shapes``
    starts empty and is filled in by ``detect_overlaps`` once shape ids
    have been assigned.
    """

    # Font lookups walk the filesystem, so results are memoized per font name.
    _font_path_cache: Dict[str, Optional[str]] = {}

    @staticmethod
    def emu_to_inches(emu: int) -> float:
        """Convert EMUs (English Metric Units) to inches."""
        return emu / 914400.0

    @staticmethod
    def inches_to_pixels(inches: float, dpi: int = 96) -> int:
        """Convert inches to pixels at given DPI (truncating toward zero)."""
        return int(inches * dpi)

    @staticmethod
    def _emu_or_zero(shape: Any, attr: str) -> int:
        """Return ``shape.<attr>``, or 0 when it is missing or ``None``."""
        value = getattr(shape, attr, None)
        return 0 if value is None else value

    @staticmethod
    def get_font_path(font_name: str) -> Optional[str]:
        """Get the font file path for a given font name.

        The result (including a failed lookup) is cached for the lifetime of
        the process, because this is called once per paragraph.

        Args:
            font_name: Name of the font (e.g., 'Arial', 'Calibri')

        Returns:
            Path to the font file, or None if not found
        """
        cache = ShapeData._font_path_cache
        if font_name not in cache:
            cache[font_name] = ShapeData._find_font_path(font_name)
        return cache[font_name]

    @staticmethod
    def _find_font_path(font_name: str) -> Optional[str]:
        """Search the platform font directories for a file matching *font_name*."""
        # Font directories and extensions by platform
        if platform.system() == "Darwin":  # macOS
            font_dirs = [
                "/System/Library/Fonts/",
                "/Library/Fonts/",
                "~/Library/Fonts/",
            ]
            extensions = [".ttf", ".otf", ".ttc", ".dfont"]
        else:  # Linux
            font_dirs = [
                "/usr/share/fonts/truetype/",
                "/usr/local/share/fonts/",
                "~/.fonts/",
            ]
            extensions = [".ttf", ".otf"]

        # Common font file variations to try
        font_variations = [
            font_name,
            font_name.lower(),
            font_name.replace(" ", ""),
            font_name.replace(" ", "-"),
        ]
        needle = font_name.lower().replace(" ", "")

        for font_dir in font_dirs:
            root = Path(font_dir).expanduser()
            if not root.exists():
                continue

            # First try exact file names directly inside the directory
            for variant in font_variations:
                for ext in extensions:
                    candidate = root / f"{variant}{ext}"
                    if candidate.exists():
                        return str(candidate)

            # Then fuzzy-match file names containing the font name. Fonts are
            # commonly installed in per-family sub-directories, so the search
            # is recursive.
            try:
                matches = [
                    path
                    for path in root.rglob("*")
                    if path.suffix.lower() in extensions
                    and needle in path.name.lower().replace(" ", "")
                    and path.is_file()
                ]
            except OSError:
                # Unreadable directory: move on to the next candidate root.
                continue

            if matches:
                # Shortest file name first ("Arial.ttf" before
                # "Arial Bold Italic.ttf"); full path breaks ties so the
                # choice is deterministic.
                return str(min(matches, key=lambda p: (len(p.name), str(p))))

        return None

    @staticmethod
    def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]:
        """Get slide dimensions from slide object.

        Args:
            slide: Slide object

        Returns:
            Tuple of (width_emu, height_emu) or (None, None) if not found
        """
        try:
            prs = slide.part.package.presentation_part.presentation
            return prs.slide_width, prs.slide_height
        except (AttributeError, TypeError):
            return None, None

    @staticmethod
    def get_default_font_size(shape: "BaseShape", slide_layout: Any) -> Optional[float]:
        """Extract default font size from slide layout for a placeholder shape.

        Only the first layout placeholder whose type equals the shape's
        placeholder type is inspected.

        Args:
            shape: Placeholder shape
            slide_layout: Slide layout containing the placeholder definition

        Returns:
            Default font size in points, or None if not found
        """
        try:
            if not hasattr(shape, "placeholder_format"):
                return None

            shape_type = shape.placeholder_format.type  # type: ignore
            for layout_placeholder in slide_layout.placeholders:
                if layout_placeholder.placeholder_format.type == shape_type:
                    # Find first defRPr element with sz (size) attribute
                    for elem in layout_placeholder.element.iter():
                        if "defRPr" in elem.tag and (sz := elem.get("sz")):
                            # "sz" is expressed in hundredths of a point
                            return float(sz) / 100.0
                    break
        except Exception:
            # Best effort: any failure while probing the layout means
            # "no default size known".
            pass
        return None

    def __init__(
        self,
        shape: "BaseShape",
        absolute_left: Optional[int] = None,
        absolute_top: Optional[int] = None,
        slide: Optional[Any] = None,
    ):
        """Initialize from a PowerPoint shape object.

        Args:
            shape: The PowerPoint shape object (should be pre-validated)
            absolute_left: Absolute left position in EMUs (for shapes in groups)
            absolute_top: Absolute top position in EMUs (for shapes in groups)
            slide: Optional slide object to get dimensions and layout information
        """
        self.shape = shape  # Store reference to original shape
        self.shape_id: str = ""  # Will be set after sorting

        # Get slide dimensions from slide object
        self.slide_width_emu, self.slide_height_emu = (
            self.get_slide_dimensions(slide) if slide else (None, None)
        )

        # Get placeholder type if applicable
        self.placeholder_type: Optional[str] = None
        self.default_font_size: Optional[float] = None
        if hasattr(shape, "is_placeholder") and shape.is_placeholder:  # type: ignore
            if shape.placeholder_format and shape.placeholder_format.type:  # type: ignore
                # Keep only the bare member name from the enum's string form.
                self.placeholder_type = (
                    str(shape.placeholder_format.type).split(".")[-1].split(" ")[0]  # type: ignore
                )

            # Get default font size from layout
            if slide and hasattr(slide, "slide_layout"):
                self.default_font_size = self.get_default_font_size(
                    shape, slide.slide_layout
                )

        # Use absolute positions if provided (for shapes in groups), otherwise
        # the shape's own position. A missing or None value counts as 0 so a
        # shape without its own geometry cannot crash the extraction.
        left_emu = (
            absolute_left
            if absolute_left is not None
            else self._emu_or_zero(shape, "left")
        )
        top_emu = (
            absolute_top if absolute_top is not None else self._emu_or_zero(shape, "top")
        )
        width_emu = self._emu_or_zero(shape, "width")
        height_emu = self._emu_or_zero(shape, "height")

        self.left: float = round(self.emu_to_inches(left_emu), 2)
        self.top: float = round(self.emu_to_inches(top_emu), 2)
        self.width: float = round(self.emu_to_inches(width_emu), 2)
        self.height: float = round(self.emu_to_inches(height_emu), 2)

        # Store EMU positions for overflow calculations
        self.left_emu = left_emu
        self.top_emu = top_emu
        self.width_emu = width_emu
        self.height_emu = height_emu

        # Calculate overflow status
        self.frame_overflow_bottom: Optional[float] = None
        self.slide_overflow_right: Optional[float] = None
        self.slide_overflow_bottom: Optional[float] = None
        self.overlapping_shapes: Dict[
            str, float
        ] = {}  # Dict of shape_id -> overlap area in sq inches
        self.warnings: List[str] = []
        self._estimate_frame_overflow()
        self._calculate_slide_overflow()
        self._detect_bullet_issues()

    @property
    def paragraphs(self) -> List["ParagraphData"]:
        """Build ParagraphData for every non-blank paragraph of the text frame."""
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return []

        return [
            ParagraphData(paragraph)
            for paragraph in self.shape.text_frame.paragraphs  # type: ignore
            if paragraph.text.strip()
        ]

    def _get_default_font_size(self) -> int:
        """Get default font size from the slide master text styles.

        Looks at ``titleStyle`` for placeholders whose type contains
        "TITLE" and at ``bodyStyle`` otherwise, returning the first ``sz``
        attribute found (in whole points). Falls back to 14.
        """
        try:
            if not (
                hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout")
            ):
                return 14

            slide_master = self.shape.part.slide_layout.slide_master  # type: ignore
            if not hasattr(slide_master, "element"):
                return 14

            # Determine theme style based on placeholder type
            style_name = "bodyStyle"  # Default
            if self.placeholder_type and "TITLE" in self.placeholder_type:
                style_name = "titleStyle"

            # Find font size in theme styles
            for child in slide_master.element.iter():
                tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
                if tag == style_name:
                    for elem in child.iter():
                        if "sz" in elem.attrib:
                            return int(elem.attrib["sz"]) // 100
        except Exception:
            # Best effort: fall through to the conservative default.
            pass

        return 14  # Conservative default for body text

    def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]:
        """Get usable width and height in pixels after accounting for margins.

        A margin the text frame reports (including an explicit 0) is used
        as-is; the built-in defaults apply only when the attribute is
        missing or None.
        """
        # Default PowerPoint margins in inches
        margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1}

        # Override with actual margins. The test is "is not None" rather than
        # truthiness so that a zero margin is honoured.
        for side in ("top", "bottom", "left", "right"):
            value = getattr(text_frame, f"margin_{side}", None)
            if value is not None:
                margins[side] = self.emu_to_inches(value)

        # Calculate usable area
        usable_width = self.width - margins["left"] - margins["right"]
        usable_height = self.height - margins["top"] - margins["bottom"]

        # Convert to pixels
        return (
            self.inches_to_pixels(usable_width),
            self.inches_to_pixels(usable_height),
        )

    def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]:
        """Wrap a single line of text to fit within max_width_px.

        Breaks only at spaces; a single word wider than the limit is kept
        whole on its own line.
        """
        if not line:
            return [""]

        # Use textlength for efficient width calculation
        if draw.textlength(line, font=font) <= max_width_px:
            return [line]

        # Need to wrap - greedily pack words onto each line
        wrapped = []
        current_line = ""
        for word in line.split(" "):
            test_line = current_line + (" " if current_line else "") + word
            if draw.textlength(test_line, font=font) <= max_width_px:
                current_line = test_line
            else:
                if current_line:
                    wrapped.append(current_line)
                current_line = word

        if current_line:
            wrapped.append(current_line)

        return wrapped

    def _estimate_frame_overflow(self) -> None:
        """Estimate if text overflows the shape bounds using PIL text measurement.

        Sets ``frame_overflow_bottom`` (inches) when the estimated text
        height exceeds the usable frame height by more than 0.05 inches.
        """
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return

        text_frame = self.shape.text_frame  # type: ignore
        if not text_frame or not text_frame.paragraphs:
            return

        # Get usable dimensions after accounting for margins
        usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame)
        if usable_width_px <= 0 or usable_height_px <= 0:
            return

        # Set up PIL for text measurement
        dummy_img = Image.new("RGB", (1, 1))
        draw = ImageDraw.Draw(dummy_img)

        # Size used for paragraphs that carry no explicit font size
        default_font_size = self._get_default_font_size()

        # Calculate total height of all paragraphs
        total_height_px = 0

        for para_idx, paragraph in enumerate(text_frame.paragraphs):
            if not paragraph.text.strip():
                continue

            para_data = ParagraphData(paragraph)

            # Load font for this paragraph, falling back to PIL's default
            font_name = para_data.font_name or "Arial"
            font_size = int(para_data.font_size or default_font_size)

            font = None
            font_path = self.get_font_path(font_name)
            if font_path:
                try:
                    font = ImageFont.truetype(font_path, size=font_size)
                except Exception:
                    font = ImageFont.load_default()
            else:
                font = ImageFont.load_default()

            # Wrap all lines in this paragraph
            all_wrapped_lines = []
            for line in paragraph.text.split("\n"):
                wrapped = self._wrap_text_line(line, usable_width_px, draw, font)
                all_wrapped_lines.extend(wrapped)

            if all_wrapped_lines:
                # Calculate line height (points -> pixels at 96 DPI)
                if para_data.line_spacing:
                    # Custom line spacing explicitly set
                    line_height_px = para_data.line_spacing * 96 / 72
                else:
                    # PowerPoint default single spacing (1.0x font size)
                    line_height_px = font_size * 96 / 72

                # Add space_before (except first paragraph)
                if para_idx > 0 and para_data.space_before:
                    total_height_px += para_data.space_before * 96 / 72

                # Add paragraph text height
                total_height_px += len(all_wrapped_lines) * line_height_px

                # Add space_after
                if para_data.space_after:
                    total_height_px += para_data.space_after * 96 / 72

        # Check for overflow (ignore negligible overflows <= 0.05")
        if total_height_px > usable_height_px:
            overflow_px = total_height_px - usable_height_px
            overflow_inches = round(overflow_px / 96.0, 2)
            if overflow_inches > 0.05:  # Only report significant overflows
                self.frame_overflow_bottom = overflow_inches

    def _calculate_slide_overflow(self) -> None:
        """Calculate if shape overflows the right or bottom slide boundary."""
        if self.slide_width_emu is None or self.slide_height_emu is None:
            return

        # Check right overflow (ignore negligible overflows <= 0.01")
        right_edge_emu = self.left_emu + self.width_emu
        if right_edge_emu > self.slide_width_emu:
            overflow_emu = right_edge_emu - self.slide_width_emu
            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
            if overflow_inches > 0.01:  # Only report significant overflows
                self.slide_overflow_right = overflow_inches

        # Check bottom overflow (ignore negligible overflows <= 0.01")
        bottom_edge_emu = self.top_emu + self.height_emu
        if bottom_edge_emu > self.slide_height_emu:
            overflow_emu = bottom_edge_emu - self.slide_height_emu
            overflow_inches = round(self.emu_to_inches(overflow_emu), 2)
            if overflow_inches > 0.01:  # Only report significant overflows
                self.slide_overflow_bottom = overflow_inches

    def _detect_bullet_issues(self) -> None:
        """Warn (once per shape) when a paragraph starts with a typed bullet glyph."""
        if not self.shape or not hasattr(self.shape, "text_frame"):
            return

        text_frame = self.shape.text_frame  # type: ignore
        if not text_frame or not text_frame.paragraphs:
            return

        # Common bullet symbols (followed by a space) that indicate manual bullets
        manual_prefixes = ("• ", "● ", "○ ")

        for paragraph in text_frame.paragraphs:
            if paragraph.text.strip().startswith(manual_prefixes):
                self.warnings.append(
                    "manual_bullet_symbol: use proper bullet formatting"
                )
                break

    @property
    def has_any_issues(self) -> bool:
        """Check if shape has any issues (overflow, overlap, or warnings)."""
        return (
            self.frame_overflow_bottom is not None
            or self.slide_overflow_right is not None
            or self.slide_overflow_bottom is not None
            or len(self.overlapping_shapes) > 0
            or len(self.warnings) > 0
        )

    def to_dict(self) -> "ShapeDict":
        """Convert to dictionary for JSON serialization.

        Geometry and ``paragraphs`` are always present; placeholder type,
        default font size, overflow, overlap and warnings appear only when
        they have a value.
        """
        result: "ShapeDict" = {
            "left": self.left,
            "top": self.top,
            "width": self.width,
            "height": self.height,
        }

        # Add optional fields if present
        if self.placeholder_type:
            result["placeholder_type"] = self.placeholder_type

        if self.default_font_size:
            result["default_font_size"] = self.default_font_size

        # Add overflow information only if there is overflow
        overflow_data = {}

        # Add frame overflow if present
        if self.frame_overflow_bottom is not None:
            overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom}

        # Add slide overflow if present
        slide_overflow = {}
        if self.slide_overflow_right is not None:
            slide_overflow["overflow_right"] = self.slide_overflow_right
        if self.slide_overflow_bottom is not None:
            slide_overflow["overflow_bottom"] = self.slide_overflow_bottom
        if slide_overflow:
            overflow_data["slide"] = slide_overflow

        # Only add overflow field if there is overflow
        if overflow_data:
            result["overflow"] = overflow_data

        # Add overlap field if there are overlapping shapes
        if self.overlapping_shapes:
            result["overlap"] = {"overlapping_shapes": self.overlapping_shapes}

        # Add warnings field if there are warnings
        if self.warnings:
            result["warnings"] = self.warnings

        # Paragraphs go last so metadata precedes the text in the output
        result["paragraphs"] = [para.to_dict() for para in self.paragraphs]

        return result
740
741
def is_valid_shape(shape: BaseShape) -> bool:
    """Decide whether a shape carries text worth putting in the inventory.

    A shape is rejected when it has no text frame, when its text is blank,
    when it is a SLIDE_NUMBER placeholder, or when it is a FOOTER
    placeholder whose text consists only of digits.
    """
    # Must have a text frame with content
    frame = getattr(shape, "text_frame", None)
    if not frame:
        return False

    content = frame.text.strip()
    if not content:
        return False

    # Only placeholders can be slide numbers or numeric footers
    if getattr(shape, "is_placeholder", False):
        fmt = shape.placeholder_format  # type: ignore
        if fmt and fmt.type:
            # Reduce the enum's string form to its bare member name.
            kind = str(fmt.type).split(".")[-1].split(" ")[0]
            if kind == "SLIDE_NUMBER":
                return False
            if kind == "FOOTER" and content.isdigit():
                return False

    return True
764
765
def collect_shapes_with_absolute_positions(
    shape: BaseShape, parent_left: int = 0, parent_top: int = 0
) -> List[ShapeWithPosition]:
    """Recursively collect all shapes with valid text, calculating absolute positions.

    A shape exposing a ``shapes`` attribute is treated as a group: it is not
    returned itself, and its children are visited with the group's left/top
    added to the accumulated parent offset. Any other shape is returned when
    ``is_valid_shape`` accepts it.

    A missing or ``None`` left/top (a shape without its own position)
    counts as 0 instead of raising in the offset arithmetic.

    NOTE(review): only left/top offsets are summed; a group's child
    coordinate offset and scaling (a:chOff / a:chExt) are not applied here.
    Confirm whether that matters for the decks this runs on.

    Args:
        shape: The shape to process
        parent_left: Accumulated left offset from parent groups (in EMUs)
        parent_top: Accumulated top offset from parent groups (in EMUs)

    Returns:
        List of ShapeWithPosition objects with absolute positions
    """
    # "or 0" maps both a missing attribute and an explicit None to zero.
    absolute_left = parent_left + (getattr(shape, "left", None) or 0)
    absolute_top = parent_top + (getattr(shape, "top", None) or 0)

    if hasattr(shape, "shapes"):  # GroupShape
        collected: List[ShapeWithPosition] = []
        # Process children with accumulated offsets
        for child in shape.shapes:  # type: ignore
            collected.extend(
                collect_shapes_with_absolute_positions(
                    child, absolute_left, absolute_top
                )
            )
        return collected

    # Regular shape - keep it only if it has valid text
    if is_valid_shape(shape):
        return [
            ShapeWithPosition(
                shape=shape,
                absolute_left=absolute_left,
                absolute_top=absolute_top,
            )
        ]

    return []
817
818
def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]:
    """Order shapes in reading order (top-to-bottom, then left-to-right).

    After sorting by (top, left), consecutive shapes whose top lies within
    0.5 inches of the first shape of the current row are put in that row;
    each row is then ordered by left edge.
    """
    if not shapes:
        return shapes

    ordered = sorted(shapes, key=lambda s: (s.top, s.left))

    # Split into rows, anchoring each row on the top of its first shape
    rows = [[ordered[0]]]
    anchor_top = ordered[0].top
    for candidate in ordered[1:]:
        if abs(candidate.top - anchor_top) <= 0.5:
            rows[-1].append(candidate)
        else:
            rows.append([candidate])
            anchor_top = candidate.top

    # Flatten the rows, each one sorted left-to-right
    result = []
    for row in rows:
        result.extend(sorted(row, key=lambda s: s.left))
    return result
847
848
def calculate_overlap(
    rect1: Tuple[float, float, float, float],
    rect2: Tuple[float, float, float, float],
    tolerance: float = 0.05,
) -> Tuple[bool, float]:
    """Calculate if and how much two rectangles overlap.

    Args:
        rect1: (left, top, width, height) of first rectangle in inches
        rect2: (left, top, width, height) of second rectangle in inches
        tolerance: Minimum overlap in inches, required in BOTH dimensions,
            to consider the rectangles overlapping (default: 0.05")

    Returns:
        Tuple of (overlaps, overlap_area) where:
        - overlaps: True if rectangles overlap by more than tolerance
        - overlap_area: Area of overlap in square inches rounded to two
          decimals, or 0.0 when they do not overlap
    """
    left1, top1, w1, h1 = rect1
    left2, top2, w2, h2 = rect2

    # Extent of the intersection; zero or negative means the rectangles are
    # disjoint (or merely touching) on that axis.
    overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2)
    overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2)

    # Check if there's meaningful overlap (more than tolerance)
    if overlap_width > tolerance and overlap_height > tolerance:
        return True, round(overlap_width * overlap_height, 2)

    # Float zero keeps the return type consistent with the annotation.
    return False, 0.0
880
881
def detect_overlaps(shapes: List[ShapeData]) -> None:
    """Record, on every shape, which other shapes it overlaps and by how much.

    Each unordered pair is compared once via ``calculate_overlap`` using the
    shapes' inch geometry. When a pair overlaps, both shapes get an entry in
    ``overlapping_shapes`` keyed by the other's ``shape_id`` with the overlap
    area in square inches. The shapes are modified in place.

    Args:
        shapes: List of ShapeData objects with shape_id attributes set
    """
    total = len(shapes)

    for i, first in enumerate(shapes):
        for j in range(i + 1, total):
            second = shapes[j]

            # Ensure shape IDs are set
            assert first.shape_id, f"Shape at index {i} has no shape_id"
            assert second.shape_id, f"Shape at index {j} has no shape_id"

            overlaps, overlap_area = calculate_overlap(
                (first.left, first.top, first.width, first.height),
                (second.left, second.top, second.width, second.height),
            )
            if not overlaps:
                continue

            # The relation is symmetric: note it on both shapes
            first.overlapping_shapes[second.shape_id] = overlap_area
            second.overlapping_shapes[first.shape_id] = overlap_area
912
913
def extract_text_inventory(
    pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False
) -> InventoryData:
    """Build the text inventory for every slide of a presentation.

    Args:
        pptx_path: Path to the PowerPoint file
        prs: Optional Presentation object to use. If not provided, will load from pptx_path.
        issues_only: If True, only include shapes that have overflow or overlap issues

    Returns a nested dictionary: {slide-N: {shape-N: ShapeData}}
    Slide numbers are zero-based slide indices; shape numbers follow the
    visual order (top-to-bottom, left-to-right) and are assigned before any
    issue filtering. Slides that end up with no shapes are omitted.
    Call to_dict() on the ShapeData objects for JSON serialization.
    """
    presentation = prs if prs is not None else Presentation(str(pptx_path))
    inventory: InventoryData = {}

    for slide_idx, slide in enumerate(presentation.slides):
        # Flatten groups into text shapes carrying absolute positions
        positioned = [
            entry
            for top_level in slide.shapes  # type: ignore
            for entry in collect_shapes_with_absolute_positions(top_level)
        ]
        if not positioned:
            continue

        # Wrap each one in ShapeData, then order them visually
        ordered = sort_shapes_by_position(
            [
                ShapeData(entry.shape, entry.absolute_left, entry.absolute_top, slide)
                for entry in positioned
            ]
        )

        # IDs come from the visual order and stay fixed from here on
        for idx, shape_data in enumerate(ordered):
            shape_data.shape_id = f"shape-{idx}"

        # Overlap detection needs the IDs and at least two shapes
        if len(ordered) > 1:
            detect_overlaps(ordered)

        # Filtering happens after overlap detection so overlaps count as issues
        kept = [sd for sd in ordered if sd.has_any_issues] if issues_only else ordered
        if not kept:
            continue

        inventory[f"slide-{slide_idx}"] = {sd.shape_id: sd for sd in kept}

    return inventory
975
976
def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict:
    """Extract the text inventory as plain, JSON-serializable dictionaries.

    Thin wrapper over extract_text_inventory that replaces every ShapeData
    with the result of its to_dict(); handy for tests and for callers that
    serialize the result themselves.

    Args:
        pptx_path: Path to the PowerPoint file
        issues_only: If True, only include shapes that have overflow or overlap issues

    Returns:
        Nested dictionary with all data serialized for JSON
    """
    extracted = extract_text_inventory(pptx_path, issues_only=issues_only)

    # Same two-level layout, with each ShapeData flattened to a dict
    return {
        slide_key: {
            shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items()
        }
        for slide_key, shapes in extracted.items()
    }
1001
1002
def save_inventory(inventory: InventoryData, output_path: Path) -> None:
    """Write the inventory to *output_path* as indented UTF-8 JSON.

    Every ShapeData is converted with to_dict() first; non-ASCII characters
    are written as-is rather than escaped.
    """
    serializable: InventoryDict = {
        slide_key: {
            shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items()
        }
        for slide_key, shapes in inventory.items()
    }

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(serializable, handle, indent=2, ensure_ascii=False)
1017
1018
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()