""" Template parser engine for the universal data report generator. Reads any .pptx template and outputs a structured TemplateProfile describing master slide types, placeholders, colors, fonts, and layout geometry. """ from __future__ import annotations import os import re from dataclasses import dataclass, field from pathlib import Path from typing import Optional from pptx import Presentation from pptx.dml.color import RGBColor from pptx.util import Emu # ============================================================================== # DATA MODELS # ============================================================================== @dataclass class MasterSlideInfo: slide_index: int master_type: str # 'cover' | 'content' | 'toc' | 'end' | 'unknown' placeholders: list[str] = field(default_factory=list) content_top: int = 0 # EMU has_footer: bool = False has_background: bool = False shape_count: int = 0 @dataclass class TemplateProfile: path: str is_builtin: bool slide_width: int slide_height: int master_slides: list[MasterSlideInfo] = field(default_factory=list) placeholder_map: dict[str, list[int]] = field(default_factory=dict) detected_theme: dict[str, str] = field(default_factory=dict) detected_fonts: dict[str, str] = field(default_factory=dict) safe_margins: dict[str, int] = field(default_factory=dict) def get_master_for(self, page_type: str) -> Optional[MasterSlideInfo]: """Return the first master slide matching page_type, or None.""" for ms in self.master_slides: if ms.master_type == page_type: return ms return None def get_content_top(self, page_type: str = "content") -> int: """Return content_top for the given page_type, or best guess.""" ms = self.get_master_for(page_type) if ms and ms.content_top > 0: return ms.content_top # Fallback to any content page for ms in self.master_slides: if ms.master_type == "content" and ms.content_top > 0: return ms.content_top # Hard fallback return int(Emu(1422400)) def get_master_index_for(self, page_type: str) -> int: 
"""Return slide index for page_type, with fallback rules.""" ms = self.get_master_for(page_type) if ms: return ms.slide_index # Fallback heuristics if page_type == "cover" and self.master_slides: return self.master_slides[0].slide_index if page_type == "end" and self.master_slides: return self.master_slides[-1].slide_index if page_type == "toc" and len(self.master_slides) >= 3: return self.master_slides[2].slide_index if len(self.master_slides) >= 2: return self.master_slides[1].slide_index return 0 # ============================================================================== # PLACEHOLDER DETECTION # ============================================================================== _PLACEHOLDER_RE = re.compile(r"\{[^{}]+\}") # Canonical placeholder -> list of aliases (including itself) PLACEHOLDER_ALIASES: dict[str, list[str]] = { "{report_title}": ["{report_title}", "{标题}", "{title}", "{报告标题}"], "{report_type}": ["{report_type}", "{报告类型}", "{type}"], "{date}": ["{date}", "{日期}", "{report_date}", "{报告日期}"], "{department}": ["{department}", "{部门}", "{source}", "{来源}", "{dept}"], "{period}": ["{period}", "{周期}", "{report_period}", "{时间周期}"], "{gen_time}": ["{gen_time}", "{生成时间}", "{generated_time}"], "{page_title}": ["{page_title}", "{页面标题}", "{subtitle}", "{page_header}"], "{source}": ["{source}", "{数据来源}", "{data_source}"], "{page_num}": ["{page_num}", "{页码}", "{page_number}"], } # Chapter placeholders are generated dynamically for i in range(1, 13): PLACEHOLDER_ALIASES[f"{{chapter{i}_title}}"] = [f"{{chapter{i}_title}}", f"{{章节{i}标题}}"] PLACEHOLDER_ALIASES[f"{{chapter{i}_desc}}"] = [f"{{chapter{i}_desc}}", f"{{章节{i}描述}}"] # KPI placeholders for i in range(1, 13): PLACEHOLDER_ALIASES[f"{{kpi{i}_label}}"] = [f"{{kpi{i}_label}}", f"{{kpi{i}_name}}"] PLACEHOLDER_ALIASES[f"{{kpi{i}_value}}"] = [f"{{kpi{i}_value}}", f"{{kpi{i}_val}}"] def _scan_placeholders(slide) -> list[str]: """Scan a slide for all placeholder-like strings {xxx}.""" found = set() for shape in 
slide.shapes: if shape.has_text_frame: text = shape.text_frame.text or "" for match in _PLACEHOLDER_RE.finditer(text): found.add(match.group(0)) return sorted(found) def _normalize_placeholder(raw: str) -> Optional[str]: """Map a raw placeholder to its canonical form, if known.""" raw_lower = raw.lower() for canonical, aliases in PLACEHOLDER_ALIASES.items(): if raw_lower in [a.lower() for a in aliases]: return canonical return None # ============================================================================== # MASTER SLIDE TYPE DETECTION # ============================================================================== _TYPE_KEYWORDS: dict[str, list[str]] = { "cover": ["{report_title}", "{date}", "{department}", "{report_type}", "{gen_time}"], "content": ["{page_title}", "{source}", "{page_num}", "{period}"], "toc": ["{chapter", "contents", "目录", "catalog", "agenda"], "end": ["{report_title}", "感谢", "thank", "结语", "尾页", "end"], } def _detect_master_type(slide, slide_index: int, total_slides: int) -> str: """Detect the semantic type of a master slide.""" texts = [] placeholders = [] for shape in slide.shapes: if shape.has_text_frame: t = (shape.text_frame.text or "").strip() if t: texts.append(t.lower()) placeholders.extend(_PLACEHOLDER_RE.findall(t)) text_block = " ".join(texts) ph_block = " ".join(placeholders).lower() scores: dict[str, int] = {"cover": 0, "content": 0, "toc": 0, "end": 0, "unknown": 0} # Score by keywords for ptype, keywords in _TYPE_KEYWORDS.items(): for kw in keywords: if kw.lower() in ph_block or kw.lower() in text_block: scores[ptype] += 1 # Position heuristics if slide_index == 0: scores["cover"] += 2 if slide_index == total_slides - 1: scores["end"] += 2 if total_slides >= 3 and slide_index == 2: scores["toc"] += 1 # Content page has page_title but not report_title (cover does) if "{page_title}" in ph_block: if "{report_title}" in ph_block: # Could be cover with both; check position of report_title # If report_title is at top-left small 
text, it's a header → content scores["cover"] += 1 else: scores["content"] += 3 # TOC strongly signaled by chapter placeholders if "{chapter" in ph_block: scores["toc"] += 5 # Distinguish end from cover: end usually lacks date/department placeholders if "{date}" in ph_block and "{department}" in ph_block: scores["cover"] += 2 scores["end"] -= 1 # Cover usually has KPI placeholders if "{kpi1_label}" in ph_block: scores["cover"] += 2 best = max(scores, key=lambda k: scores[k]) if scores[best] == 0: # Default fallback by position if slide_index == 0: return "cover" if slide_index == total_slides - 1: return "end" return "content" return best # ============================================================================== # CONTENT TOP DETECTION # ============================================================================== def _detect_content_top(slide, default_gap: int = 381000) -> int: """Detect content start Y by finding page_title placeholder bottom + gap.""" page_title_bottom = None for shape in slide.shapes: if not shape.has_text_frame: continue text = shape.text_frame.text or "" # Match any page_title alias if _matches_any_placeholder(text, "{page_title}"): page_title_bottom = int(shape.top) + int(shape.height) break if page_title_bottom is not None: return page_title_bottom + default_gap # Fallback: find any text shape in the upper area that looks like a title for shape in slide.shapes: if not shape.has_text_frame: continue if int(shape.top) > Emu(500000) and int(shape.top) < Emu(1500000): text = (shape.text_frame.text or "").strip() if text and len(text) < 40 and "{" not in text: return int(shape.top) + int(shape.height) + default_gap return int(Emu(1422400)) def _matches_any_placeholder(text: str, canonical: str) -> bool: aliases = PLACEHOLDER_ALIASES.get(canonical, [canonical]) for alias in aliases: if alias in text: return True return False # ============================================================================== # COLOR EXTRACTION # 
# ==============================================================================

# Most-common text colors that are only accepted when they are the sole
# text color found (compared without the leading "#").
_GENERIC_TEXT_COLORS = ("FFFFFF", "000000")

# Placeholders whose presence near the bottom of a slide marks a footer.
_FOOTER_PLACEHOLDERS = ("{source}", "{period}", "{page_num}")

# Margins (EMU) used for any side that could not be detected.
_DEFAULT_SAFE_MARGINS = {
    "left": 762000,
    "right": 762000,
    "top": 254000,
    "bottom": 254000,
}


def _extract_colors(slide) -> dict[str, str]:
    """Extract dominant colors from a slide's shapes and theme.

    Returns a dict with any of the keys "primary", "accent", "accent2",
    "accent_neg", "text", "background" mapped to "#RRGGBB" strings.
    Every lookup is best-effort: failures are skipped, never raised.
    """
    colors: dict[str, str] = {}

    # Try theme color scheme first.
    # NOTE(review): confirm the installed python-pptx exposes `.theme` on the
    # slide master; if it does not, this section is silently skipped.
    try:
        theme = slide.slide_layout.slide_master.theme
        cs = theme.color_scheme
        theme_map = {
            "primary": cs.accent1,
            "accent": cs.accent2,
            "accent2": cs.accent3,
            "accent_neg": cs.accent6,  # often red/orange
            "text": cs.text1,
            "background": cs.background1,
        }
        for key, color_obj in theme_map.items():
            try:
                rgb = color_obj.rgb
                if rgb:
                    colors[key] = _rgb_to_hex(rgb)
            except Exception:
                pass
    except Exception:
        pass

    # Extract from shape fills (heuristic for primary color)
    fill_colors: dict[str, int] = {}
    text_colors: dict[str, int] = {}

    for shape in slide.shapes:
        # Fill colors
        try:
            fill = getattr(shape, "fill", None)
            if fill is not None and fill.type is not None:
                fore_color = getattr(fill, "fore_color", None)
                rgb = getattr(fore_color, "rgb", None) if fore_color else None
                if rgb:
                    hex_str = _rgb_to_hex(rgb)
                    fill_colors[hex_str] = fill_colors.get(hex_str, 0) + 1
                    # Weight by area
                    area = int(shape.width) * int(shape.height)
                    fill_colors[hex_str] += area // 1000000000
        except Exception:
            pass

        # Text colors
        try:
            runs = (
                [run for para in shape.text_frame.paragraphs for run in para.runs]
                if shape.has_text_frame
                else []
            )
        except Exception:
            runs = []
        for run in runs:
            # Guard each run separately: reading .rgb may raise for colors
            # that are not explicit RGB, and one such run must not discard
            # the remaining runs of the shape.
            try:
                color = run.font.color
                rgb = color.rgb if color else None
            except Exception:
                continue
            if rgb:
                hex_str = _rgb_to_hex(rgb)
                text_colors[hex_str] = text_colors.get(hex_str, 0) + 1

    # Determine primary from most common dark fill
    dark_fills = {h: c for h, c in fill_colors.items() if _is_dark_color(h)}
    if dark_fills:
        colors["primary"] = max(dark_fills, key=dark_fills.get)

    # Determine accent from bright fills
    bright_fills = {
        h: c
        for h, c in fill_colors.items()
        if _is_bright_color(h) and not _is_dark_color(h)
    }
    if bright_fills:
        colors["accent"] = max(bright_fills, key=bright_fills.get)

    # Text color: pure white/black only counts when it is the sole candidate.
    if text_colors:
        text_col = max(text_colors, key=text_colors.get)
        # _rgb_to_hex yields "#RRGGBB"; strip the "#" before comparing.
        is_generic = text_col.lstrip("#").upper() in _GENERIC_TEXT_COLORS
        if not is_generic or len(text_colors) == 1:
            colors["text"] = text_col

    return colors


def _rgb_to_hex(rgb) -> str:
    """Format an RGB value as "#RRGGBB".

    Accepts an indexable triple (r, g, b) or a value convertible with int().
    Returns "#333333" for None or anything that cannot be formatted.
    """
    if rgb is None:
        return "#333333"
    try:
        return f"#{rgb[0]:02X}{rgb[1]:02X}{rgb[2]:02X}"
    except Exception:
        try:
            return f"#{int(rgb):06X}"
        except Exception:
            return "#333333"


def _parse_hex_rgb(hex_str: str) -> Optional[tuple[int, int, int]]:
    """Split "#RRGGBB"/"RRGGBB" into integer channels, or None if malformed."""
    digits = hex_str.lstrip("#")
    if len(digits) != 6:
        return None
    try:
        return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)
    except ValueError:
        return None


def _is_dark_color(hex_str: str) -> bool:
    """Return True when the weighted luminance of hex_str is below 120."""
    channels = _parse_hex_rgb(hex_str)
    if channels is None:
        return False
    r, g, b = channels
    luminance = 0.299 * r + 0.587 * g + 0.114 * b
    return luminance < 120


def _is_bright_color(hex_str: str) -> bool:
    """Return True when the channel spread (max - min) of hex_str exceeds 40."""
    channels = _parse_hex_rgb(hex_str)
    if channels is None:
        return False
    return max(channels) - min(channels) > 40


# ==============================================================================
# FONT EXTRACTION
# ==============================================================================

def _extract_fonts(slide) -> dict[str, str]:
    """Extract dominant title and body fonts from a slide.

    Runs in shapes whose top is above 1500000 EMU count as title text, all
    others as body text. The result always contains "number_font" (the body
    font, or "Arial" when none was found) and, when detected, "title_font"
    and "body_font".
    """
    title_fonts: dict[str, int] = {}
    body_fonts: dict[str, int] = {}

    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        if shape.top is None:
            # No position set on this shape, so it cannot be classified as
            # title or body; skip it rather than fail on int(None).
            continue
        top = int(shape.top)
        # Title area: top < ~1.5M EMU
        bucket = title_fonts if top < Emu(1500000) else body_fonts
        for para in shape.text_frame.paragraphs:
            for run in para.runs:
                font_name = run.font.name
                if font_name:
                    bucket[font_name] = bucket.get(font_name, 0) + 1

    result: dict[str, str] = {}
    if title_fonts:
        result["title_font"] = max(title_fonts, key=title_fonts.get)
    if body_fonts:
        result["body_font"] = max(body_fonts, key=body_fonts.get)
    # Number font often same as body or Arial; keep it simple
    result["number_font"] = result.get("body_font", "Arial")
    return result


# ==============================================================================
# SAFE MARGIN DETECTION
# ==============================================================================

def _extract_safe_margins(slide) -> dict[str, int]:
    """Estimate safe margins by looking at leftmost/topmost shapes.

    Only offsets strictly between 0 and 2000000 EMU are considered. "left"
    and "right" are both set to the smallest left offset and "top" to the
    smallest top offset; either may be absent when no shape qualifies.
    "bottom" is always the fixed default of 254000 EMU.
    """
    lefts = []
    tops = []
    for shape in slide.shapes:
        # Best-effort: shapes without usable geometry are ignored.
        try:
            left = int(shape.left)
            top = int(shape.top)
            if 0 < left < Emu(2000000):
                lefts.append(left)
            if 0 < top < Emu(2000000):
                tops.append(top)
        except Exception:
            pass

    margins = {}
    if lefts:
        margins["left"] = min(lefts)
        margins["right"] = min(lefts)
    if tops:
        margins["top"] = min(tops)
    # Bottom margin harder to detect; use default
    margins["bottom"] = int(Emu(254000))
    return margins


# ==============================================================================
# BACKGROUND DETECTION
# ==============================================================================

def _has_background(slide) -> bool:
    """Check if slide has explicit background shapes or images.

    True when the slide background has a fill type, or when a shape anchored
    at (0, 0) is wider than 10000000 EMU and taller than 5000000 EMU.
    """
    try:
        if slide.background.fill.type is not None:
            return True
    except Exception:
        pass

    for shape in slide.shapes:
        # Best-effort: shapes without usable geometry are ignored.
        try:
            anchored_at_origin = int(shape.left) == 0 and int(shape.top) == 0
            if (
                anchored_at_origin
                and int(shape.width) > Emu(10000000)
                and int(shape.height) > Emu(5000000)
            ):
                return True
        except Exception:
            pass
    return False


def _has_footer(slide) -> bool:
    """Check if slide has footer-like text at bottom.

    A footer is a text shape whose top is below 8000000 EMU and whose text
    contains a source, period, or page-number placeholder (aliases included).

    NOTE(review): the 8000000 EMU threshold is absolute; confirm it suits
    templates whose slide height differs from the built-in ones.
    """
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        # Best-effort: shapes without usable geometry are ignored.
        try:
            if int(shape.top) <= Emu(8000000):
                continue
            text = (shape.text_frame.text or "").strip()
            if text and any(
                _matches_any_placeholder(text, canonical)
                for canonical in _FOOTER_PLACEHOLDERS
            ):
                return True
        except Exception:
            pass
    return False


# ==============================================================================
# MAIN PARSER
# ==============================================================================

def parse_template(path: str) -> TemplateProfile:
    """Parse a .pptx template file and return a TemplateProfile.

    Args:
        path: location of the .pptx file; stored in the profile as an
            absolute path.

    Returns:
        A TemplateProfile with one MasterSlideInfo per slide, a placeholder
        map, theme colors and fonts chosen by majority vote across slides,
        and safe margins with all four sides populated.
    """
    abs_path = os.path.abspath(path)
    prs = Presentation(abs_path)
    total_slides = len(prs.slides)

    # NOTE(review): substring test, so any path containing "assets" counts
    # as built-in; confirm that this is intended.
    is_builtin = "assets" in abs_path.replace("\\", "/").lower()

    master_slides: list[MasterSlideInfo] = []
    placeholder_map: dict[str, list[int]] = {}
    all_colors: dict[str, dict[str, int]] = {}
    all_fonts: dict[str, dict[str, int]] = {}

    for idx, slide in enumerate(prs.slides):
        placeholders = _scan_placeholders(slide)
        master_slides.append(
            MasterSlideInfo(
                slide_index=idx,
                master_type=_detect_master_type(slide, idx, total_slides),
                placeholders=placeholders,
                content_top=_detect_content_top(slide),
                has_footer=_has_footer(slide),
                has_background=_has_background(slide),
                shape_count=len(list(slide.shapes)),
            )
        )

        # Build placeholder -> master index map
        for ph in placeholders:
            canonical = _normalize_placeholder(ph) or ph
            slide_indexes = placeholder_map.setdefault(canonical, [])
            if idx not in slide_indexes:
                slide_indexes.append(idx)

        # Aggregate colors
        for key, value in _extract_colors(slide).items():
            votes = all_colors.setdefault(key, {})
            votes[value] = votes.get(value, 0) + 1

        # Aggregate fonts
        for key, value in _extract_fonts(slide).items():
            votes = all_fonts.setdefault(key, {})
            votes[value] = votes.get(value, 0) + 1

    # Determine final detected_theme / detected_fonts by voting across slides
    detected_theme: dict[str, str] = {
        key: max(votes, key=votes.get) for key, votes in all_colors.items() if votes
    }
    detected_fonts: dict[str, str] = {
        key: max(votes, key=votes.get) for key, votes in all_fonts.items() if votes
    }

    # Safe margins: use first content-like slide or cover
    detected_margins: dict[str, int] = {}
    for ms in master_slides:
        if ms.master_type in ("content", "cover"):
            detected_margins = _extract_safe_margins(prs.slides[ms.slide_index])
            break
    # Detection may leave sides out (it always reports "bottom" but only
    # sometimes "left"/"right"/"top"), so fill every gap from the defaults.
    safe_margins: dict[str, int] = {**_DEFAULT_SAFE_MARGINS, **detected_margins}

    # Resolve slide dimensions
    slide_width = int(prs.slide_width) if prs.slide_width else 16256000
    slide_height = int(prs.slide_height) if prs.slide_height else 9144000

    return TemplateProfile(
        path=abs_path,
        is_builtin=is_builtin,
        slide_width=slide_width,
        slide_height=slide_height,
        master_slides=master_slides,
        placeholder_map=placeholder_map,
        detected_theme=detected_theme,
        detected_fonts=detected_fonts,
        safe_margins=safe_margins,
    )


def get_builtin_template_profile(report_type: str = "daily") -> TemplateProfile:
    """Parse a built-in template and return its profile.

    Unknown report types fall back to the "daily" template.
    """
    base = os.path.join(os.path.dirname(__file__), "..", "assets")
    template_map = {
        "daily": os.path.join(base, "report-master.pptx"),
        "weekly": os.path.join(base, "weekly-master.pptx"),
        "monthly": os.path.join(base, "monthly-master.pptx"),
    }
    path = template_map.get(report_type, template_map["daily"])
    return parse_template(path)


# ==============================================================================
# DEBUG
# ==============================================================================

if __name__ == "__main__":
    for rtype in ["daily", "weekly", "monthly"]:
        profile = get_builtin_template_profile(rtype)
        print(f"\n=== {rtype.upper()} TEMPLATE PROFILE ===")
        print(f" Path: {profile.path}")
        print(f" Size: {profile.slide_width} x {profile.slide_height}")
        print(" Masters:")
        for ms in profile.master_slides:
            print(f" [{ms.slide_index}] {ms.master_type}: placeholders={ms.placeholders}, content_top={ms.content_top}")
        print(f" Theme: {profile.detected_theme}")
        print(f" Fonts: {profile.detected_fonts}")
        print(f" Margins: {profile.safe_margins}")