template_parser.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. """
  2. Template parser engine for the universal data report generator.
  3. Reads any .pptx template and outputs a structured TemplateProfile describing
  4. master slide types, placeholders, colors, fonts, and layout geometry.
  5. """
  6. from __future__ import annotations
  7. import os
  8. import re
  9. from dataclasses import dataclass, field
  10. from pathlib import Path
  11. from typing import Optional
  12. from pptx import Presentation
  13. from pptx.dml.color import RGBColor
  14. from pptx.util import Emu
  15. # ==============================================================================
  16. # DATA MODELS
  17. # ==============================================================================
  18. @dataclass
  19. class MasterSlideInfo:
  20. slide_index: int
  21. master_type: str # 'cover' | 'content' | 'toc' | 'end' | 'unknown'
  22. placeholders: list[str] = field(default_factory=list)
  23. content_top: int = 0 # EMU
  24. has_footer: bool = False
  25. has_background: bool = False
  26. shape_count: int = 0
  27. @dataclass
  28. class TemplateProfile:
  29. path: str
  30. is_builtin: bool
  31. slide_width: int
  32. slide_height: int
  33. master_slides: list[MasterSlideInfo] = field(default_factory=list)
  34. placeholder_map: dict[str, list[int]] = field(default_factory=dict)
  35. detected_theme: dict[str, str] = field(default_factory=dict)
  36. detected_fonts: dict[str, str] = field(default_factory=dict)
  37. safe_margins: dict[str, int] = field(default_factory=dict)
  38. def get_master_for(self, page_type: str) -> Optional[MasterSlideInfo]:
  39. """Return the first master slide matching page_type, or None."""
  40. for ms in self.master_slides:
  41. if ms.master_type == page_type:
  42. return ms
  43. return None
  44. def get_content_top(self, page_type: str = "content") -> int:
  45. """Return content_top for the given page_type, or best guess."""
  46. ms = self.get_master_for(page_type)
  47. if ms and ms.content_top > 0:
  48. return ms.content_top
  49. # Fallback to any content page
  50. for ms in self.master_slides:
  51. if ms.master_type == "content" and ms.content_top > 0:
  52. return ms.content_top
  53. # Hard fallback
  54. return int(Emu(1422400))
  55. def get_master_index_for(self, page_type: str) -> int:
  56. """Return slide index for page_type, with fallback rules."""
  57. ms = self.get_master_for(page_type)
  58. if ms:
  59. return ms.slide_index
  60. # Fallback heuristics
  61. if page_type == "cover" and self.master_slides:
  62. return self.master_slides[0].slide_index
  63. if page_type == "end" and self.master_slides:
  64. return self.master_slides[-1].slide_index
  65. if page_type == "toc" and len(self.master_slides) >= 3:
  66. return self.master_slides[2].slide_index
  67. if len(self.master_slides) >= 2:
  68. return self.master_slides[1].slide_index
  69. return 0
  70. # ==============================================================================
  71. # PLACEHOLDER DETECTION
  72. # ==============================================================================
  73. _PLACEHOLDER_RE = re.compile(r"\{[^{}]+\}")
  74. # Canonical placeholder -> list of aliases (including itself)
  75. PLACEHOLDER_ALIASES: dict[str, list[str]] = {
  76. "{report_title}": ["{report_title}", "{标题}", "{title}", "{报告标题}"],
  77. "{report_type}": ["{report_type}", "{报告类型}", "{type}"],
  78. "{date}": ["{date}", "{日期}", "{report_date}", "{报告日期}"],
  79. "{department}": ["{department}", "{部门}", "{source}", "{来源}", "{dept}"],
  80. "{period}": ["{period}", "{周期}", "{report_period}", "{时间周期}"],
  81. "{gen_time}": ["{gen_time}", "{生成时间}", "{generated_time}"],
  82. "{page_title}": ["{page_title}", "{页面标题}", "{subtitle}", "{page_header}"],
  83. "{source}": ["{source}", "{数据来源}", "{data_source}"],
  84. "{page_num}": ["{page_num}", "{页码}", "{page_number}"],
  85. }
  86. # Chapter placeholders are generated dynamically
  87. for i in range(1, 13):
  88. PLACEHOLDER_ALIASES[f"{{chapter{i}_title}}"] = [f"{{chapter{i}_title}}", f"{{章节{i}标题}}"]
  89. PLACEHOLDER_ALIASES[f"{{chapter{i}_desc}}"] = [f"{{chapter{i}_desc}}", f"{{章节{i}描述}}"]
  90. # KPI placeholders
  91. for i in range(1, 13):
  92. PLACEHOLDER_ALIASES[f"{{kpi{i}_label}}"] = [f"{{kpi{i}_label}}", f"{{kpi{i}_name}}"]
  93. PLACEHOLDER_ALIASES[f"{{kpi{i}_value}}"] = [f"{{kpi{i}_value}}", f"{{kpi{i}_val}}"]
  94. def _scan_placeholders(slide) -> list[str]:
  95. """Scan a slide for all placeholder-like strings {xxx}."""
  96. found = set()
  97. for shape in slide.shapes:
  98. if shape.has_text_frame:
  99. text = shape.text_frame.text or ""
  100. for match in _PLACEHOLDER_RE.finditer(text):
  101. found.add(match.group(0))
  102. return sorted(found)
  103. def _normalize_placeholder(raw: str) -> Optional[str]:
  104. """Map a raw placeholder to its canonical form, if known."""
  105. raw_lower = raw.lower()
  106. for canonical, aliases in PLACEHOLDER_ALIASES.items():
  107. if raw_lower in [a.lower() for a in aliases]:
  108. return canonical
  109. return None
  110. # ==============================================================================
  111. # MASTER SLIDE TYPE DETECTION
  112. # ==============================================================================
  113. _TYPE_KEYWORDS: dict[str, list[str]] = {
  114. "cover": ["{report_title}", "{date}", "{department}", "{report_type}", "{gen_time}"],
  115. "content": ["{page_title}", "{source}", "{page_num}", "{period}"],
  116. "toc": ["{chapter", "contents", "目录", "catalog", "agenda"],
  117. "end": ["{report_title}", "感谢", "thank", "结语", "尾页", "end"],
  118. }
  119. def _detect_master_type(slide, slide_index: int, total_slides: int) -> str:
  120. """Detect the semantic type of a master slide."""
  121. texts = []
  122. placeholders = []
  123. for shape in slide.shapes:
  124. if shape.has_text_frame:
  125. t = (shape.text_frame.text or "").strip()
  126. if t:
  127. texts.append(t.lower())
  128. placeholders.extend(_PLACEHOLDER_RE.findall(t))
  129. text_block = " ".join(texts)
  130. ph_block = " ".join(placeholders).lower()
  131. scores: dict[str, int] = {"cover": 0, "content": 0, "toc": 0, "end": 0, "unknown": 0}
  132. # Score by keywords
  133. for ptype, keywords in _TYPE_KEYWORDS.items():
  134. for kw in keywords:
  135. if kw.lower() in ph_block or kw.lower() in text_block:
  136. scores[ptype] += 1
  137. # Position heuristics
  138. if slide_index == 0:
  139. scores["cover"] += 2
  140. if slide_index == total_slides - 1:
  141. scores["end"] += 2
  142. if total_slides >= 3 and slide_index == 2:
  143. scores["toc"] += 1
  144. # Content page has page_title but not report_title (cover does)
  145. if "{page_title}" in ph_block:
  146. if "{report_title}" in ph_block:
  147. # Could be cover with both; check position of report_title
  148. # If report_title is at top-left small text, it's a header → content
  149. scores["cover"] += 1
  150. else:
  151. scores["content"] += 3
  152. # TOC strongly signaled by chapter placeholders
  153. if "{chapter" in ph_block:
  154. scores["toc"] += 5
  155. # Distinguish end from cover: end usually lacks date/department placeholders
  156. if "{date}" in ph_block and "{department}" in ph_block:
  157. scores["cover"] += 2
  158. scores["end"] -= 1
  159. # Cover usually has KPI placeholders
  160. if "{kpi1_label}" in ph_block:
  161. scores["cover"] += 2
  162. best = max(scores, key=lambda k: scores[k])
  163. if scores[best] == 0:
  164. # Default fallback by position
  165. if slide_index == 0:
  166. return "cover"
  167. if slide_index == total_slides - 1:
  168. return "end"
  169. return "content"
  170. return best
  171. # ==============================================================================
  172. # CONTENT TOP DETECTION
  173. # ==============================================================================
  174. def _detect_content_top(slide, default_gap: int = 381000) -> int:
  175. """Detect content start Y by finding page_title placeholder bottom + gap."""
  176. page_title_bottom = None
  177. for shape in slide.shapes:
  178. if not shape.has_text_frame:
  179. continue
  180. text = shape.text_frame.text or ""
  181. # Match any page_title alias
  182. if _matches_any_placeholder(text, "{page_title}"):
  183. page_title_bottom = int(shape.top) + int(shape.height)
  184. break
  185. if page_title_bottom is not None:
  186. return page_title_bottom + default_gap
  187. # Fallback: find any text shape in the upper area that looks like a title
  188. for shape in slide.shapes:
  189. if not shape.has_text_frame:
  190. continue
  191. if int(shape.top) > Emu(500000) and int(shape.top) < Emu(1500000):
  192. text = (shape.text_frame.text or "").strip()
  193. if text and len(text) < 40 and "{" not in text:
  194. return int(shape.top) + int(shape.height) + default_gap
  195. return int(Emu(1422400))
  196. def _matches_any_placeholder(text: str, canonical: str) -> bool:
  197. aliases = PLACEHOLDER_ALIASES.get(canonical, [canonical])
  198. for alias in aliases:
  199. if alias in text:
  200. return True
  201. return False
  202. # ==============================================================================
  203. # COLOR EXTRACTION
  204. # ==============================================================================
  205. def _extract_colors(slide) -> dict[str, str]:
  206. """Extract dominant colors from a slide's shapes and theme."""
  207. colors: dict[str, str] = {}
  208. # Try theme color scheme first
  209. try:
  210. theme = slide.slide_layout.slide_master.theme
  211. cs = theme.color_scheme
  212. # Map theme colors
  213. theme_map = {
  214. "primary": cs.accent1,
  215. "accent": cs.accent2,
  216. "accent2": cs.accent3,
  217. "accent_neg": cs.accent6, # often red/orange
  218. "text": cs.text1,
  219. "background": cs.background1,
  220. }
  221. for key, color_obj in theme_map.items():
  222. try:
  223. rgb = color_obj.rgb
  224. if rgb:
  225. colors[key] = _rgb_to_hex(rgb)
  226. except Exception:
  227. pass
  228. except Exception:
  229. pass
  230. # Extract from shape fills (heuristic for primary color)
  231. fill_colors: dict[str, int] = {}
  232. text_colors: dict[str, int] = {}
  233. for shape in slide.shapes:
  234. # Fill colors
  235. try:
  236. if hasattr(shape, "fill") and shape.fill.type is not None:
  237. if hasattr(shape.fill, "fore_color") and shape.fill.fore_color:
  238. rgb = getattr(shape.fill.fore_color, "rgb", None)
  239. if rgb:
  240. hex_str = _rgb_to_hex(rgb)
  241. fill_colors[hex_str] = fill_colors.get(hex_str, 0) + 1
  242. # Weight by area
  243. area = int(shape.width) * int(shape.height)
  244. fill_colors[hex_str] += area // 1000000000
  245. except Exception:
  246. pass
  247. # Text colors
  248. try:
  249. if shape.has_text_frame:
  250. for para in shape.text_frame.paragraphs:
  251. for run in para.runs:
  252. if run.font.color and run.font.color.rgb:
  253. hex_str = _rgb_to_hex(run.font.color.rgb)
  254. text_colors[hex_str] = text_colors.get(hex_str, 0) + 1
  255. except Exception:
  256. pass
  257. # Determine primary from most common dark fill
  258. dark_fills = {h: c for h, c in fill_colors.items() if _is_dark_color(h)}
  259. if dark_fills:
  260. primary = max(dark_fills, key=lambda k: dark_fills[k])
  261. colors["primary"] = primary
  262. # Determine accent from bright fills
  263. bright_fills = {h: c for h, c in fill_colors.items() if _is_bright_color(h) and not _is_dark_color(h)}
  264. if bright_fills:
  265. accent = max(bright_fills, key=lambda k: bright_fills[k])
  266. colors["accent"] = accent
  267. # Text color
  268. if text_colors:
  269. text_col = max(text_colors, key=lambda k: text_colors[k])
  270. if text_col.upper() not in ("FFFFFF", "000000") or len(text_colors) == 1:
  271. colors["text"] = text_col
  272. return colors
  273. def _rgb_to_hex(rgb) -> str:
  274. if rgb is None:
  275. return "#333333"
  276. try:
  277. return f"#{rgb[0]:02X}{rgb[1]:02X}{rgb[2]:02X}"
  278. except Exception:
  279. try:
  280. return f"#{int(rgb):06X}"
  281. except Exception:
  282. return "#333333"
  283. def _is_dark_color(hex_str: str) -> bool:
  284. hex_str = hex_str.lstrip("#")
  285. if len(hex_str) != 6:
  286. return False
  287. try:
  288. r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16)
  289. luminance = 0.299 * r + 0.587 * g + 0.114 * b
  290. return luminance < 120
  291. except Exception:
  292. return False
  293. def _is_bright_color(hex_str: str) -> bool:
  294. hex_str = hex_str.lstrip("#")
  295. if len(hex_str) != 6:
  296. return False
  297. try:
  298. r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16)
  299. saturation = max(r, g, b) - min(r, g, b)
  300. return saturation > 40
  301. except Exception:
  302. return False
  303. # ==============================================================================
  304. # FONT EXTRACTION
  305. # ==============================================================================
  306. def _extract_fonts(slide) -> dict[str, str]:
  307. """Extract dominant title and body fonts from a slide."""
  308. title_fonts: dict[str, int] = {}
  309. body_fonts: dict[str, int] = {}
  310. for shape in slide.shapes:
  311. if not shape.has_text_frame:
  312. continue
  313. top = int(shape.top)
  314. for para in shape.text_frame.paragraphs:
  315. for run in para.runs:
  316. font_name = run.font.name
  317. if not font_name:
  318. continue
  319. # Title area: top < ~1.5M EMU (approx 3.8cm)
  320. if top < Emu(1500000):
  321. title_fonts[font_name] = title_fonts.get(font_name, 0) + 1
  322. else:
  323. body_fonts[font_name] = body_fonts.get(font_name, 0) + 1
  324. result: dict[str, str] = {}
  325. if title_fonts:
  326. result["title_font"] = max(title_fonts, key=lambda k: title_fonts[k])
  327. if body_fonts:
  328. result["body_font"] = max(body_fonts, key=lambda k: body_fonts[k])
  329. # Number font often same as body or Arial; keep it simple
  330. result["number_font"] = result.get("body_font", "Arial")
  331. return result
  332. # ==============================================================================
  333. # SAFE MARGIN DETECTION
  334. # ==============================================================================
  335. def _extract_safe_margins(slide) -> dict[str, int]:
  336. """Estimate safe margins by looking at leftmost/topmost shapes."""
  337. lefts = []
  338. tops = []
  339. for shape in slide.shapes:
  340. try:
  341. l = int(shape.left)
  342. t = int(shape.top)
  343. if l > 0 and l < Emu(2000000):
  344. lefts.append(l)
  345. if t > 0 and t < Emu(2000000):
  346. tops.append(t)
  347. except Exception:
  348. pass
  349. margins = {}
  350. if lefts:
  351. margins["left"] = min(lefts)
  352. margins["right"] = min(lefts)
  353. if tops:
  354. margins["top"] = min(tops)
  355. # Bottom margin harder to detect; use default
  356. margins["bottom"] = int(Emu(254000))
  357. return margins
  358. # ==============================================================================
  359. # BACKGROUND DETECTION
  360. # ==============================================================================
  361. def _has_background(slide) -> bool:
  362. """Check if slide has explicit background shapes or images."""
  363. try:
  364. if slide.background.fill.type is not None:
  365. return True
  366. except Exception:
  367. pass
  368. for shape in slide.shapes:
  369. try:
  370. if int(shape.left) == 0 and int(shape.top) == 0:
  371. if int(shape.width) > Emu(10000000) and int(shape.height) > Emu(5000000):
  372. return True
  373. except Exception:
  374. pass
  375. return False
  376. def _has_footer(slide) -> bool:
  377. """Check if slide has footer-like text at bottom."""
  378. for shape in slide.shapes:
  379. if not shape.has_text_frame:
  380. continue
  381. try:
  382. top = int(shape.top)
  383. if top > Emu(8000000):
  384. text = (shape.text_frame.text or "").strip()
  385. if text and ("{source}" in text or "{period}" in text or "{page_num}" in text):
  386. return True
  387. except Exception:
  388. pass
  389. return False
  390. # ==============================================================================
  391. # MAIN PARSER
  392. # ==============================================================================
  393. def parse_template(path: str) -> TemplateProfile:
  394. """Parse a .pptx template file and return a TemplateProfile."""
  395. abs_path = os.path.abspath(path)
  396. prs = Presentation(abs_path)
  397. total_slides = len(prs.slides)
  398. is_builtin = "assets" in abs_path.replace("\\", "/").lower()
  399. master_slides: list[MasterSlideInfo] = []
  400. placeholder_map: dict[str, list[int]] = {}
  401. all_colors: dict[str, dict[str, int]] = {}
  402. all_fonts: dict[str, dict[str, int]] = {}
  403. for idx, slide in enumerate(prs.slides):
  404. mtype = _detect_master_type(slide, idx, total_slides)
  405. placeholders = _scan_placeholders(slide)
  406. content_top = _detect_content_top(slide)
  407. ms = MasterSlideInfo(
  408. slide_index=idx,
  409. master_type=mtype,
  410. placeholders=placeholders,
  411. content_top=content_top,
  412. has_footer=_has_footer(slide),
  413. has_background=_has_background(slide),
  414. shape_count=len(list(slide.shapes)),
  415. )
  416. master_slides.append(ms)
  417. # Build placeholder -> master index map
  418. for ph in placeholders:
  419. canonical = _normalize_placeholder(ph) or ph
  420. if canonical not in placeholder_map:
  421. placeholder_map[canonical] = []
  422. if idx not in placeholder_map[canonical]:
  423. placeholder_map[canonical].append(idx)
  424. # Aggregate colors
  425. colors = _extract_colors(slide)
  426. for k, v in colors.items():
  427. if k not in all_colors:
  428. all_colors[k] = {}
  429. all_colors[k][v] = all_colors[k].get(v, 0) + 1
  430. # Aggregate fonts
  431. fonts = _extract_fonts(slide)
  432. for k, v in fonts.items():
  433. if k not in all_fonts:
  434. all_fonts[k] = {}
  435. all_fonts[k][v] = all_fonts[k].get(v, 0) + 1
  436. # Determine final detected_theme by voting across master slides
  437. detected_theme: dict[str, str] = {}
  438. for key, vote in all_colors.items():
  439. if vote:
  440. detected_theme[key] = max(vote, key=lambda k: vote[k])
  441. # Determine final detected_fonts by voting
  442. detected_fonts: dict[str, str] = {}
  443. for key, vote in all_fonts.items():
  444. if vote:
  445. detected_fonts[key] = max(vote, key=lambda k: vote[k])
  446. # Safe margins: use first content-like slide or cover
  447. safe_margins: dict[str, int] = {}
  448. for ms in master_slides:
  449. if ms.master_type in ("content", "cover"):
  450. slide = prs.slides[ms.slide_index]
  451. safe_margins = _extract_safe_margins(slide)
  452. break
  453. if not safe_margins:
  454. safe_margins = {"left": int(Emu(762000)), "right": int(Emu(762000)), "top": int(Emu(254000)), "bottom": int(Emu(254000))}
  455. # Resolve slide dimensions
  456. slide_width = int(prs.slide_width) if prs.slide_width else 16256000
  457. slide_height = int(prs.slide_height) if prs.slide_height else 9144000
  458. return TemplateProfile(
  459. path=abs_path,
  460. is_builtin=is_builtin,
  461. slide_width=slide_width,
  462. slide_height=slide_height,
  463. master_slides=master_slides,
  464. placeholder_map=placeholder_map,
  465. detected_theme=detected_theme,
  466. detected_fonts=detected_fonts,
  467. safe_margins=safe_margins,
  468. )
  469. def get_builtin_template_profile(report_type: str = "daily") -> TemplateProfile:
  470. """Parse a built-in template and return its profile."""
  471. base = os.path.join(os.path.dirname(__file__), "..", "assets")
  472. template_map = {
  473. "daily": os.path.join(base, "report-master.pptx"),
  474. "weekly": os.path.join(base, "weekly-master.pptx"),
  475. "monthly": os.path.join(base, "monthly-master.pptx"),
  476. }
  477. path = template_map.get(report_type, template_map["daily"])
  478. return parse_template(path)
  479. # ==============================================================================
  480. # DEBUG
  481. # ==============================================================================
  482. if __name__ == "__main__":
  483. import json
  484. for rtype in ["daily", "weekly", "monthly"]:
  485. profile = get_builtin_template_profile(rtype)
  486. print(f"\n=== {rtype.upper()} TEMPLATE PROFILE ===")
  487. print(f" Path: {profile.path}")
  488. print(f" Size: {profile.slide_width} x {profile.slide_height}")
  489. print(f" Masters:")
  490. for ms in profile.master_slides:
  491. print(f" [{ms.slide_index}] {ms.master_type}: placeholders={ms.placeholders}, content_top={ms.content_top}")
  492. print(f" Theme: {profile.detected_theme}")
  493. print(f" Fonts: {profile.detected_fonts}")
  494. print(f" Margins: {profile.safe_margins}")