| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587 |
- """
- Template parser engine for the universal data report generator.
- Reads any .pptx template and outputs a structured TemplateProfile describing
- master slide types, placeholders, colors, fonts, and layout geometry.
- """
- from __future__ import annotations
- import os
- import re
- from dataclasses import dataclass, field
- from pathlib import Path
- from typing import Optional
- from pptx import Presentation
- from pptx.dml.color import RGBColor
- from pptx.util import Emu
- # ==============================================================================
- # DATA MODELS
- # ==============================================================================
@dataclass
class MasterSlideInfo:
    """Per-slide summary recorded while parsing a template.

    The field order is part of the generated constructor signature, so it
    must stay as declared here.
    """

    # Zero-based position of the slide inside the presentation.
    slide_index: int
    # Semantic role: 'cover' | 'content' | 'toc' | 'end' | 'unknown'.
    master_type: str
    # Brace-delimited placeholder tokens found in the slide's text.
    placeholders: list[str] = field(default_factory=list)
    # Y offset, in EMU, at which body content may start.
    content_top: int = 0
    # True when footer-style placeholder text was detected.
    has_footer: bool = False
    # True when a background fill or a full-bleed shape was detected.
    has_background: bool = False
    # Number of shapes on the slide.
    shape_count: int = 0
@dataclass
class TemplateProfile:
    """Structured description of one parsed .pptx template."""

    # Absolute path of the parsed file.
    path: str
    # Whether the file was classified as one of the bundled templates.
    is_builtin: bool
    # Slide dimensions in EMU.
    slide_width: int
    slide_height: int
    # One entry per slide, in presentation order.
    master_slides: list[MasterSlideInfo] = field(default_factory=list)
    # Placeholder token -> indices of the slides that contain it.
    placeholder_map: dict[str, list[int]] = field(default_factory=dict)
    # Colour role -> hex colour string.
    detected_theme: dict[str, str] = field(default_factory=dict)
    # Font role -> font family name.
    detected_fonts: dict[str, str] = field(default_factory=dict)
    # Edge name -> margin in EMU.
    safe_margins: dict[str, int] = field(default_factory=dict)

    def get_master_for(self, page_type: str) -> Optional[MasterSlideInfo]:
        """Return the earliest slide whose type equals ``page_type``, else None."""
        return next(
            (info for info in self.master_slides if info.master_type == page_type),
            None,
        )

    def get_content_top(self, page_type: str = "content") -> int:
        """Return the content start offset (EMU) to use for ``page_type``.

        Lookup order: the slide of the requested type, then the first
        'content' slide, then a fixed default of 1422400 EMU. A slide only
        counts when its ``content_top`` is positive.
        """
        requested = self.get_master_for(page_type)
        if requested and requested.content_top > 0:
            return requested.content_top

        # The requested type gave nothing usable; borrow from a content page.
        for info in self.master_slides:
            if info.master_type == "content" and info.content_top > 0:
                return info.content_top

        # Nothing usable anywhere in the template.
        return int(Emu(1422400))

    def get_master_index_for(self, page_type: str) -> int:
        """Return the slide index to use for ``page_type``.

        When no slide has that type, fall back by position: first slide for
        'cover', last slide for 'end', third slide for 'toc' (if there are at
        least three), otherwise the second slide, otherwise 0.
        """
        matched = self.get_master_for(page_type)
        if matched:
            return matched.slide_index

        slides = self.master_slides
        if not slides:
            return 0

        count = len(slides)
        if page_type == "cover":
            return slides[0].slide_index
        if page_type == "end":
            return slides[-1].slide_index
        if page_type == "toc" and count >= 3:
            return slides[2].slide_index
        return slides[1].slide_index if count >= 2 else 0
- # ==============================================================================
- # PLACEHOLDER DETECTION
- # ==============================================================================
# A brace-delimited token with no nested braces, e.g. "{page_title}".
_PLACEHOLDER_RE = re.compile(r"\{[^{}]+\}")

# Canonical placeholder -> every accepted spelling (the canonical one included).
# NOTE(review): "{source}" is listed as an alias of "{department}" and is also
# a canonical key of its own further down.
PLACEHOLDER_ALIASES: dict[str, list[str]] = {
    "{report_title}": ["{report_title}", "{标题}", "{title}", "{报告标题}"],
    "{report_type}": ["{report_type}", "{报告类型}", "{type}"],
    "{date}": ["{date}", "{日期}", "{report_date}", "{报告日期}"],
    "{department}": ["{department}", "{部门}", "{source}", "{来源}", "{dept}"],
    "{period}": ["{period}", "{周期}", "{report_period}", "{时间周期}"],
    "{gen_time}": ["{gen_time}", "{生成时间}", "{generated_time}"],
    "{page_title}": ["{page_title}", "{页面标题}", "{subtitle}", "{page_header}"],
    "{source}": ["{source}", "{数据来源}", "{data_source}"],
    "{page_num}": ["{page_num}", "{页码}", "{page_number}"],
}

# Numbered chapter placeholders (1..12) are generated instead of spelled out.
for i in range(1, 13):
    PLACEHOLDER_ALIASES.update(
        {
            f"{{chapter{i}_title}}": [f"{{chapter{i}_title}}", f"{{章节{i}标题}}"],
            f"{{chapter{i}_desc}}": [f"{{chapter{i}_desc}}", f"{{章节{i}描述}}"],
        }
    )

# Numbered KPI placeholders (1..12), added after all chapter entries.
for i in range(1, 13):
    PLACEHOLDER_ALIASES.update(
        {
            f"{{kpi{i}_label}}": [f"{{kpi{i}_label}}", f"{{kpi{i}_name}}"],
            f"{{kpi{i}_value}}": [f"{{kpi{i}_value}}", f"{{kpi{i}_val}}"],
        }
    )
def _scan_placeholders(slide) -> list[str]:
    """Return the distinct ``{...}`` tokens found on a slide, sorted.

    Only shapes that report a text frame are inspected; a missing/empty text
    value is treated as an empty string.
    """
    tokens = {
        hit.group(0)
        for shape in slide.shapes
        if shape.has_text_frame
        for hit in _PLACEHOLDER_RE.finditer(shape.text_frame.text or "")
    }
    return sorted(tokens)
- def _normalize_placeholder(raw: str) -> Optional[str]:
- """Map a raw placeholder to its canonical form, if known."""
- raw_lower = raw.lower()
- for canonical, aliases in PLACEHOLDER_ALIASES.items():
- if raw_lower in [a.lower() for a in aliases]:
- return canonical
- return None
- # ==============================================================================
- # MASTER SLIDE TYPE DETECTION
- # ==============================================================================
- _TYPE_KEYWORDS: dict[str, list[str]] = {
- "cover": ["{report_title}", "{date}", "{department}", "{report_type}", "{gen_time}"],
- "content": ["{page_title}", "{source}", "{page_num}", "{period}"],
- "toc": ["{chapter", "contents", "目录", "catalog", "agenda"],
- "end": ["{report_title}", "感谢", "thank", "结语", "尾页", "end"],
- }
def _detect_master_type(slide, slide_index: int, total_slides: int) -> str:
    """Detect the semantic type of a template slide.

    Each candidate type ('cover', 'content', 'toc', 'end') collects points
    from keyword hits, slide position and a few placeholder combinations; the
    highest score wins, ties going to the type listed first in the score
    table. When the winning score is zero the type is decided purely by
    position: first slide -> 'cover', last slide -> 'end', otherwise
    'content'.

    Args:
        slide: Object exposing ``shapes``; each shape exposes
            ``has_text_frame`` and ``text_frame.text``.
        slide_index: Zero-based index of the slide.
        total_slides: Number of slides in the presentation.

    Returns:
        One of 'cover', 'content', 'toc', 'end'.
    """

    def _keyword_hit(keyword: str, haystack: str) -> bool:
        # Purely alphabetic ASCII keywords must begin a word. With plain
        # substring matching the bare keyword "end" also fired on "agenda"
        # (itself a TOC keyword), "trend", "calendar", and so on. Keywords
        # containing braces or non-ASCII text keep substring semantics.
        if keyword.isascii() and keyword.isalpha():
            pattern = r"(?<![a-z])" + re.escape(keyword)
            return re.search(pattern, haystack) is not None
        return keyword in haystack

    texts = []
    placeholders = []
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        stripped = (shape.text_frame.text or "").strip()
        if stripped:
            texts.append(stripped.lower())
            placeholders.extend(_PLACEHOLDER_RE.findall(stripped))

    # Both haystacks are lower-cased, so every comparison below is
    # case-insensitive.
    text_block = " ".join(texts)
    ph_block = " ".join(placeholders).lower()

    scores: dict[str, int] = {"cover": 0, "content": 0, "toc": 0, "end": 0, "unknown": 0}

    # One point per keyword found in either the placeholders or the text.
    for ptype, keywords in _TYPE_KEYWORDS.items():
        for kw in keywords:
            needle = kw.lower()
            if _keyword_hit(needle, ph_block) or _keyword_hit(needle, text_block):
                scores[ptype] += 1

    # Position heuristics.
    if slide_index == 0:
        scores["cover"] += 2
    if slide_index == total_slides - 1:
        scores["end"] += 2
    if total_slides >= 3 and slide_index == 2:
        scores["toc"] += 1

    # A page title without a report title strongly suggests a content page.
    # NOTE(review): when both are present the slide is scored as a cover; the
    # position of the report title is not inspected.
    if "{page_title}" in ph_block:
        if "{report_title}" in ph_block:
            scores["cover"] += 1
        else:
            scores["content"] += 3

    # Chapter placeholders are the strongest TOC signal.
    if "{chapter" in ph_block:
        scores["toc"] += 5

    # Date and department together point to a cover rather than an end page.
    if "{date}" in ph_block and "{department}" in ph_block:
        scores["cover"] += 2
        scores["end"] -= 1

    # KPI placeholders are treated as a cover signal.
    if "{kpi1_label}" in ph_block:
        scores["cover"] += 2

    best = max(scores, key=lambda k: scores[k])
    if scores[best] == 0:
        # No evidence at all: decide by position only.
        if slide_index == 0:
            return "cover"
        if slide_index == total_slides - 1:
            return "end"
        return "content"
    return best
- # ==============================================================================
- # CONTENT TOP DETECTION
- # ==============================================================================
def _detect_content_top(slide, default_gap: int = 381000) -> int:
    """Return the Y offset (EMU) at which body content should start.

    Strategy, in order:
      1. bottom edge of the first shape whose text contains a page-title
         placeholder (any alias), plus ``default_gap``;
      2. bottom edge of the first short, placeholder-free text shape whose
         top lies strictly between 500000 and 1500000 EMU, plus
         ``default_gap``;
      3. a fixed default of 1422400 EMU.

    Shapes whose position or size is unavailable (None) are skipped instead
    of raising TypeError from ``int(None)``, so one such shape can no longer
    abort the parse. Other helpers in this module already tolerate that case.
    NOTE(review): presumably this occurs when a shape inherits its geometry;
    confirm against the python-pptx documentation.

    Args:
        slide: Object exposing ``shapes`` with ``top``/``height`` geometry.
        default_gap: Spacing in EMU added below the detected title.
    """

    def _bottom_edge(shape) -> Optional[int]:
        # Bottom edge in EMU, or None when the geometry is not available.
        top, height = shape.top, shape.height
        if top is None or height is None:
            return None
        return int(top) + int(height)

    # 1) Explicit page-title placeholder.
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        text = shape.text_frame.text or ""
        if _matches_any_placeholder(text, "{page_title}"):
            bottom = _bottom_edge(shape)
            if bottom is not None:
                return bottom + default_gap

    # 2) Anything in the upper band that looks like a literal title.
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        top = shape.top
        if top is None or not (Emu(500000) < int(top) < Emu(1500000)):
            continue
        text = (shape.text_frame.text or "").strip()
        if text and len(text) < 40 and "{" not in text:
            bottom = _bottom_edge(shape)
            if bottom is not None:
                return bottom + default_gap

    # 3) Nothing usable on this slide.
    return int(Emu(1422400))
- def _matches_any_placeholder(text: str, canonical: str) -> bool:
- aliases = PLACEHOLDER_ALIASES.get(canonical, [canonical])
- for alias in aliases:
- if alias in text:
- return True
- return False
- # ==============================================================================
- # COLOR EXTRACTION
- # ==============================================================================
def _extract_colors(slide) -> dict[str, str]:
    """Extract a best-effort colour palette from one slide.

    Colours are returned as "#RRGGBB" strings keyed by role. Theme colours
    are read first; colours observed on the slide's shapes then override
    'primary' (most heavily weighted dark fill), 'accent' (most heavily
    weighted saturated, non-dark fill) and 'text' (most frequent run colour).

    Every access to the presentation objects is best-effort: any exception
    raised while reading a colour is swallowed and that colour is skipped.

    Returns:
        Mapping with any subset of the keys 'primary', 'accent', 'accent2',
        'accent_neg', 'text', 'background'.
    """
    colors: dict[str, str] = {}

    # --- Theme colour scheme -------------------------------------------------
    # NOTE(review): confirm that the installed python-pptx exposes `.theme`
    # and `.color_scheme`; if it does not, this whole section is silently
    # skipped by the outer except.
    try:
        theme = slide.slide_layout.slide_master.theme
        cs = theme.color_scheme
        theme_map = {
            "primary": cs.accent1,
            "accent": cs.accent2,
            "accent2": cs.accent3,
            "accent_neg": cs.accent6,  # often red/orange
            "text": cs.text1,
            "background": cs.background1,
        }
        for key, color_obj in theme_map.items():
            try:
                rgb = color_obj.rgb
                if rgb:
                    colors[key] = _rgb_to_hex(rgb)
            except Exception:
                pass
    except Exception:
        pass

    # --- Colours actually used on the slide ----------------------------------
    fill_colors: dict[str, int] = {}
    text_colors: dict[str, int] = {}
    for shape in slide.shapes:
        # Fill colours: one vote per shape plus a bonus proportional to area,
        # so large panels outweigh small decorations.
        try:
            if hasattr(shape, "fill") and shape.fill.type is not None:
                if hasattr(shape.fill, "fore_color") and shape.fill.fore_color:
                    rgb = getattr(shape.fill.fore_color, "rgb", None)
                    if rgb:
                        hex_str = _rgb_to_hex(rgb)
                        fill_colors[hex_str] = fill_colors.get(hex_str, 0) + 1
                        area = int(shape.width) * int(shape.height)
                        fill_colors[hex_str] += area // 1000000000
        except Exception:
            pass

        # Text colours: one vote per run with an explicit RGB colour. A
        # failure on one run abandons the rest of that shape's runs.
        try:
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    for run in para.runs:
                        if run.font.color and run.font.color.rgb:
                            hex_str = _rgb_to_hex(run.font.color.rgb)
                            text_colors[hex_str] = text_colors.get(hex_str, 0) + 1
        except Exception:
            pass

    # Primary: the most heavily weighted dark fill.
    dark_fills = {h: c for h, c in fill_colors.items() if _is_dark_color(h)}
    if dark_fills:
        colors["primary"] = max(dark_fills, key=lambda k: dark_fills[k])

    # Accent: the most heavily weighted saturated fill that is not dark.
    bright_fills = {
        h: c
        for h, c in fill_colors.items()
        if _is_bright_color(h) and not _is_dark_color(h)
    }
    if bright_fills:
        colors["accent"] = max(bright_fills, key=lambda k: bright_fills[k])

    # Text: the most frequent run colour, except that pure white/black is
    # accepted only when it is the sole colour seen. The hex strings carry a
    # leading '#', which must be stripped before the comparison; without the
    # strip the white/black test could never match.
    if text_colors:
        text_col = max(text_colors, key=lambda k: text_colors[k])
        is_pure_white_or_black = text_col.lstrip("#").upper() in ("FFFFFF", "000000")
        if not is_pure_white_or_black or len(text_colors) == 1:
            colors["text"] = text_col

    return colors
- def _rgb_to_hex(rgb) -> str:
- if rgb is None:
- return "#333333"
- try:
- return f"#{rgb[0]:02X}{rgb[1]:02X}{rgb[2]:02X}"
- except Exception:
- try:
- return f"#{int(rgb):06X}"
- except Exception:
- return "#333333"
- def _is_dark_color(hex_str: str) -> bool:
- hex_str = hex_str.lstrip("#")
- if len(hex_str) != 6:
- return False
- try:
- r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16)
- luminance = 0.299 * r + 0.587 * g + 0.114 * b
- return luminance < 120
- except Exception:
- return False
- def _is_bright_color(hex_str: str) -> bool:
- hex_str = hex_str.lstrip("#")
- if len(hex_str) != 6:
- return False
- try:
- r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16)
- saturation = max(r, g, b) - min(r, g, b)
- return saturation > 40
- except Exception:
- return False
- # ==============================================================================
- # FONT EXTRACTION
- # ==============================================================================
- def _extract_fonts(slide) -> dict[str, str]:
- """Extract dominant title and body fonts from a slide."""
- title_fonts: dict[str, int] = {}
- body_fonts: dict[str, int] = {}
- for shape in slide.shapes:
- if not shape.has_text_frame:
- continue
- top = int(shape.top)
- for para in shape.text_frame.paragraphs:
- for run in para.runs:
- font_name = run.font.name
- if not font_name:
- continue
- # Title area: top < ~1.5M EMU (approx 3.8cm)
- if top < Emu(1500000):
- title_fonts[font_name] = title_fonts.get(font_name, 0) + 1
- else:
- body_fonts[font_name] = body_fonts.get(font_name, 0) + 1
- result: dict[str, str] = {}
- if title_fonts:
- result["title_font"] = max(title_fonts, key=lambda k: title_fonts[k])
- if body_fonts:
- result["body_font"] = max(body_fonts, key=lambda k: body_fonts[k])
- # Number font often same as body or Arial; keep it simple
- result["number_font"] = result.get("body_font", "Arial")
- return result
- # ==============================================================================
- # SAFE MARGIN DETECTION
- # ==============================================================================
- def _extract_safe_margins(slide) -> dict[str, int]:
- """Estimate safe margins by looking at leftmost/topmost shapes."""
- lefts = []
- tops = []
- for shape in slide.shapes:
- try:
- l = int(shape.left)
- t = int(shape.top)
- if l > 0 and l < Emu(2000000):
- lefts.append(l)
- if t > 0 and t < Emu(2000000):
- tops.append(t)
- except Exception:
- pass
- margins = {}
- if lefts:
- margins["left"] = min(lefts)
- margins["right"] = min(lefts)
- if tops:
- margins["top"] = min(tops)
- # Bottom margin harder to detect; use default
- margins["bottom"] = int(Emu(254000))
- return margins
- # ==============================================================================
- # BACKGROUND DETECTION
- # ==============================================================================
- def _has_background(slide) -> bool:
- """Check if slide has explicit background shapes or images."""
- try:
- if slide.background.fill.type is not None:
- return True
- except Exception:
- pass
- for shape in slide.shapes:
- try:
- if int(shape.left) == 0 and int(shape.top) == 0:
- if int(shape.width) > Emu(10000000) and int(shape.height) > Emu(5000000):
- return True
- except Exception:
- pass
- return False
- def _has_footer(slide) -> bool:
- """Check if slide has footer-like text at bottom."""
- for shape in slide.shapes:
- if not shape.has_text_frame:
- continue
- try:
- top = int(shape.top)
- if top > Emu(8000000):
- text = (shape.text_frame.text or "").strip()
- if text and ("{source}" in text or "{period}" in text or "{page_num}" in text):
- return True
- except Exception:
- pass
- return False
- # ==============================================================================
- # MAIN PARSER
- # ==============================================================================
def parse_template(path: str) -> TemplateProfile:
    """Parse a .pptx template file and return its TemplateProfile.

    Every slide is classified and measured. Placeholders are indexed under
    their canonical names (unknown tokens are kept as written). Colours and
    fonts are decided by a vote across slides, one vote per slide and role,
    with ties going to the value seen first. Safe margins come from the first
    slide typed 'content' or 'cover'.

    Args:
        path: Path to the template; it is made absolute before loading.
    """
    abs_path = os.path.abspath(path)
    prs = Presentation(abs_path)
    total_slides = len(prs.slides)
    # A template counts as built-in when "assets" appears anywhere in its
    # normalised, lower-cased absolute path.
    is_builtin = "assets" in abs_path.replace("\\", "/").lower()

    master_slides: list[MasterSlideInfo] = []
    placeholder_map: dict[str, list[int]] = {}
    color_votes: dict[str, dict[str, int]] = {}
    font_votes: dict[str, dict[str, int]] = {}

    for idx, slide in enumerate(prs.slides):
        mtype = _detect_master_type(slide, idx, total_slides)
        placeholders = _scan_placeholders(slide)
        content_top = _detect_content_top(slide)
        footer_found = _has_footer(slide)
        background_found = _has_background(slide)
        master_slides.append(
            MasterSlideInfo(
                slide_index=idx,
                master_type=mtype,
                placeholders=placeholders,
                content_top=content_top,
                has_footer=footer_found,
                has_background=background_found,
                shape_count=sum(1 for _ in slide.shapes),
            )
        )

        # Index this slide under each placeholder it contains, once.
        for token in placeholders:
            canonical = _normalize_placeholder(token) or token
            slide_indices = placeholder_map.setdefault(canonical, [])
            if idx not in slide_indices:
                slide_indices.append(idx)

        # One colour vote per role for this slide.
        for role, hex_value in _extract_colors(slide).items():
            tally = color_votes.setdefault(role, {})
            tally[hex_value] = tally.get(hex_value, 0) + 1

        # One font vote per role for this slide.
        for role, font_name in _extract_fonts(slide).items():
            tally = font_votes.setdefault(role, {})
            tally[font_name] = tally.get(font_name, 0) + 1

    # Winner per role; max() keeps the first-seen value on ties.
    detected_theme: dict[str, str] = {
        role: max(tally, key=tally.get) for role, tally in color_votes.items() if tally
    }
    detected_fonts: dict[str, str] = {
        role: max(tally, key=tally.get) for role, tally in font_votes.items() if tally
    }

    # Margins are measured on the first content-like slide, if there is one.
    reference = next(
        (info for info in master_slides if info.master_type in ("content", "cover")),
        None,
    )
    safe_margins: dict[str, int] = {}
    if reference is not None:
        safe_margins = _extract_safe_margins(prs.slides[reference.slide_index])
    if not safe_margins:
        safe_margins = {
            "left": int(Emu(762000)),
            "right": int(Emu(762000)),
            "top": int(Emu(254000)),
            "bottom": int(Emu(254000)),
        }

    # Slide dimensions, with defaults when the file does not provide them.
    slide_width = int(prs.slide_width) if prs.slide_width else 16256000
    slide_height = int(prs.slide_height) if prs.slide_height else 9144000

    return TemplateProfile(
        path=abs_path,
        is_builtin=is_builtin,
        slide_width=slide_width,
        slide_height=slide_height,
        master_slides=master_slides,
        placeholder_map=placeholder_map,
        detected_theme=detected_theme,
        detected_fonts=detected_fonts,
        safe_margins=safe_margins,
    )
def get_builtin_template_profile(report_type: str = "daily") -> TemplateProfile:
    """Parse the bundled template for ``report_type`` and return its profile.

    Templates are looked up in the ``assets`` directory next to this module's
    parent directory. An unrecognised report type falls back to the daily
    template.
    """
    assets_dir = os.path.join(os.path.dirname(__file__), "..", "assets")
    filenames = {
        "daily": "report-master.pptx",
        "weekly": "weekly-master.pptx",
        "monthly": "monthly-master.pptx",
    }
    chosen = filenames.get(report_type, filenames["daily"])
    return parse_template(os.path.join(assets_dir, chosen))
- # ==============================================================================
- # DEBUG
- # ==============================================================================
if __name__ == "__main__":
    # Debug aid: print the parsed profile of each bundled template.
    # The previously unused `import json` was removed, and lines without
    # placeholders are plain strings rather than f-strings; output is the same.
    for rtype in ["daily", "weekly", "monthly"]:
        profile = get_builtin_template_profile(rtype)
        print(f"\n=== {rtype.upper()} TEMPLATE PROFILE ===")
        print(f" Path: {profile.path}")
        print(f" Size: {profile.slide_width} x {profile.slide_height}")
        print(" Masters:")
        for ms in profile.master_slides:
            print(f" [{ms.slide_index}] {ms.master_type}: placeholders={ms.placeholders}, content_top={ms.content_top}")
        print(f" Theme: {profile.detected_theme}")
        print(f" Fonts: {profile.detected_fonts}")
        print(f" Margins: {profile.safe_margins}")
|