quality_inspector.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017
  1. """
  2. PPT quality inspector and auto-fix engine.
  3. Inspects generated PPT for layout, visual, content, and data issues,
  4. then auto-fixes them iteratively until quality threshold is met.
  5. """
  6. import re
  7. from pptx import Presentation
  8. from pptx.util import Emu, Pt
  9. from pptx.dml.color import RGBColor
  10. from collections import Counter
  11. from quality_rules import (
  12. QUALITY_RULES, SEVERITY_WEIGHTS, CATEGORY_WEIGHTS,
  13. FILL_RATIO_THRESHOLDS, FONT_SIZE_MIN, FONT_SIZE_MAX,
  14. TEXT_MIN_LENGTH, INSIGHT_MIN_COUNT, PAGE_MIN_TEXT_LENGTH,
  15. SAFE_MARGIN, CONTENT_LEFT, CONTENT_TOP_BASE,
  16. FOOTER_TOP, SLIDE_WIDTH, SLIDE_HEIGHT, DEFAULT_FONT,
  17. get_quality_label, calculate_score,
  18. )
  19. from page_layouts import calculate_fill_ratio, ensure_safe_position
  20. FORECAST_PAGE_TYPES = {
  21. 'forecast',
  22. 'prediction',
  23. 'plan',
  24. 'monthly_forecast',
  25. 'monthly_plan',
  26. 'next_month_plan',
  27. 'custom_forecast',
  28. 'custom_prediction',
  29. }
  30. class QualityIssue:
  31. def __init__(self, severity, category, page_index, description,
  32. rule_id='', auto_fixable=True, fix_data=None):
  33. self.severity = severity
  34. self.category = category
  35. self.page_index = page_index
  36. self.description = description
  37. self.rule_id = rule_id
  38. self.auto_fixable = auto_fixable
  39. self.fix_data = fix_data or {}
  40. def __repr__(self):
  41. return f"[{self.severity}] Page {self.page_index+1}: {self.description}"
  42. class QualityInspector:
  43. def __init__(self, theme_colors: dict = None, layout_context=None):
  44. self.theme_colors = theme_colors or {}
  45. self.layout_context = layout_context
  46. self.fix_count = 0
  47. self.fix_log = []
  48. def inspect(self, prs: Presentation, config=None) -> list[QualityIssue]:
  49. issues = []
  50. issues += self._check_confirmation_alignment(prs, config)
  51. for page_idx, slide in enumerate(prs.slides):
  52. page_type = self._get_page_type(page_idx, config, len(prs.slides))
  53. issues += self._check_layout(slide, page_idx)
  54. issues += self._check_visual(slide, page_idx)
  55. issues += self._check_content(slide, page_idx, config, prs, page_type)
  56. issues += self._check_data(slide, page_idx, prs)
  57. return issues
  58. def _get_page_type(self, page_idx: int, config, total_slides: int) -> str:
  59. if config and hasattr(config, 'pages') and page_idx < len(config.pages):
  60. return config.pages[page_idx].page_type
  61. if page_idx == 0:
  62. return 'cover'
  63. if page_idx == total_slides - 1:
  64. return 'end'
  65. if page_idx == 1:
  66. return 'toc'
  67. return 'content'
  68. def _check_confirmation_alignment(self, prs, config) -> list[QualityIssue]:
  69. issues = []
  70. if not config:
  71. return issues
  72. selected_pages = [p for p in getattr(config, 'pages', []) if getattr(p, 'selected', True)]
  73. if getattr(config, 'require_six_confirmations', False):
  74. confirmation = getattr(config, 'user_confirmation', None)
  75. if confirmation and hasattr(confirmation, 'is_complete') and not confirmation.is_complete():
  76. issues.append(QualityIssue(
  77. 'critical', 'data', -1,
  78. '六项确认未完成,PPT 不应进入输出阶段',
  79. 'D006', False,
  80. {'type': 'confirmation_incomplete'}
  81. ))
  82. if config and getattr(config, 'page_count_range', None) and selected_pages:
  83. low, high = config.page_count_range
  84. if len(selected_pages) < low or len(selected_pages) > high:
  85. issues.append(QualityIssue(
  86. 'major', 'data', -1,
  87. f'页面数量 {len(selected_pages)} 不在确认范围 {low}-{high} 内',
  88. 'D006', False,
  89. {'type': 'page_count_range'}
  90. ))
  91. if config and getattr(config, 'metrics', None) and len(selected_pages) > 0:
  92. selected_metrics = [m for m in config.metrics if getattr(m, 'selected', True)]
  93. if not selected_metrics:
  94. issues.append(QualityIssue(
  95. 'critical', 'data', -1,
  96. '未找到已确认的核心指标,无法验证输出一致性',
  97. 'D006', False,
  98. {'type': 'missing_metrics'}
  99. ))
  100. return issues
  101. def auto_fix(self, prs: Presentation, issues: list[QualityIssue]):
  102. fixable = [i for i in issues if i.auto_fixable]
  103. self.fix_count = 0
  104. self.fix_log = []
  105. for issue in fixable:
  106. try:
  107. if issue.page_index < 0:
  108. continue
  109. slide = prs.slides[issue.page_index]
  110. self._apply_fix(slide, issue, prs)
  111. self.fix_count += 1
  112. except Exception as e:
  113. self.fix_log.append(f"Fix failed for {issue.rule_id}: {e}")
  114. return self.fix_count
  115. def _apply_fix(self, slide, issue, prs):
  116. category = issue.category
  117. if category == 'layout':
  118. self._fix_layout(slide, issue)
  119. elif category == 'visual':
  120. self._fix_visual(slide, issue)
  121. elif category == 'content':
  122. self._fix_content(slide, issue, prs)
  123. elif category == 'data':
  124. self._fix_data(slide, issue, prs)
  125. def generate_report(self, issues: list[QualityIssue], iteration: int = 1,
  126. total_pages: int = 0) -> str:
  127. lines = []
  128. lines.append('═' * 50)
  129. lines.append(f' PPT 质量自检报告 (第 {iteration} 轮)')
  130. lines.append('═' * 50)
  131. if not issues:
  132. lines.append('[PASS] 全部通过!未发现任何质量问题。')
  133. return '\n'.join(lines)
  134. by_page = {}
  135. for iss in issues:
  136. p = iss.page_index
  137. if p not in by_page:
  138. by_page[p] = []
  139. by_page[p].append(iss)
  140. for p_idx in sorted(by_page.keys()):
  141. page_issues = by_page[p_idx]
  142. sev_order = {'critical': 0, 'major': 1, 'minor': 2}
  143. page_issues.sort(key=lambda x: sev_order.get(x.severity, 3))
  144. has_critical = any(i.severity == 'critical' for i in page_issues)
  145. has_major = any(i.severity == 'major' for i in page_issues)
  146. if has_critical:
  147. icon = '[CRIT]'
  148. elif has_major:
  149. icon = '[MAJ]'
  150. else:
  151. icon = '[OK]'
  152. lines.append(f'{icon} 第{p_idx+1}页: {len(page_issues)} 个问题')
  153. for iss in page_issues:
  154. sev_icon = {'critical': '[!!]', 'major': '[!]', 'minor': '[-]'}.get(iss.severity, '')
  155. status = ' [FIXED]' if iss.auto_fixable and iss.fix_data.get('fixed') else ''
  156. lines.append(f' ├─ {sev_icon} {iss.description}{status}')
  157. lines.append('─' * 50)
  158. by_sev = Counter(i.severity for i in issues)
  159. by_cat = Counter(i.category for i in issues)
  160. fixed = sum(1 for i in issues if i.auto_fixable and i.fix_data.get('fixed'))
  161. score = calculate_score(dict(by_sev), dict(by_cat), max(total_pages, 1))
  162. label = get_quality_label(score)
  163. lines.append(f'总结: {len(issues)} 个问题 | '
  164. f'{by_sev.get("critical", 0)} 严重 + '
  165. f'{by_sev.get("major", 0)} 主要 + '
  166. f'{by_sev.get("minor", 0)} 次要')
  167. lines.append(f'自动修复: {fixed}/{sum(1 for i in issues if i.auto_fixable)} 个')
  168. lines.append(f'最终质量评分: {score}/100 [{label}]')
  169. lines.append('═' * 50)
  170. return '\n'.join(lines)
  171. def quality_assured_build(self, build_fn, data, config, output_path,
  172. max_iterations=None, _attempt=0) -> tuple:
  173. max_iterations = max_iterations or config.max_fix_iterations
  174. total_pages = 0
  175. needs_rebuild = False
  176. rebuilt_once = False
  177. prs = None
  178. for iteration in range(1, max_iterations + 1):
  179. if iteration == 1 or needs_rebuild:
  180. if needs_rebuild:
  181. if rebuilt_once and iteration > 2:
  182. print(f'[INFO] 已尝试重建,不再继续重建以避免无限循环')
  183. needs_rebuild = False
  184. else:
  185. print(f'[REBUILD] 检测到需要重建的页面,触发重新生成...')
  186. rebuilt_once = True
  187. needs_rebuild = False
  188. prs = build_fn(data, config)
  189. total_pages = len(prs.slides)
  190. issues = self.inspect(prs, config)
  191. if not issues:
  192. print(f'[PASS] 第 {iteration} 次迭代:无问题,质量达标')
  193. break
  194. by_sev = Counter(i.severity for i in issues)
  195. print(f'[INSPECT] 第 {iteration} 次自检:{by_sev.get("critical",0)} 严重 + '
  196. f'{by_sev.get("major",0)} 主要 + {by_sev.get("minor",0)} 次要')
  197. fixable = [i for i in issues if i.auto_fixable]
  198. self.auto_fix(prs, fixable)
  199. print(f'[FIX] 自动修复了 {self.fix_count} 个问题')
  200. for issue in fixable:
  201. if issue.fix_data.get('needs_rebuild'):
  202. needs_rebuild = True
  203. print(f'[WARN] 检测到内容严重不足,将在下一轮迭代中重建')
  204. break
  205. unfixable = [i for i in issues if not i.auto_fixable]
  206. if unfixable:
  207. print(f'[WARN] {len(unfixable)} 个问题需人工确认')
  208. remaining = self.inspect(prs, config)
  209. if not remaining:
  210. print(f'[PASS] 第 {iteration} 次修复后:所有问题已解决')
  211. break
  212. has_critical = any(i.severity == 'critical' for i in remaining)
  213. has_major = any(i.severity == 'major' for i in remaining)
  214. if not has_critical and not has_major:
  215. print(f'[PASS] 第 {iteration} 次修复后:仅剩次要问题,质量达标')
  216. break
  217. if needs_rebuild and iteration < max_iterations:
  218. continue
  219. final_issues = self.inspect(prs, config)
  220. by_sev = Counter(i.severity for i in final_issues)
  221. by_cat = Counter(i.category for i in final_issues)
  222. score = calculate_score(dict(by_sev), dict(by_cat), max(total_pages, 1))
  223. label = get_quality_label(score)
  224. report = self.generate_report(final_issues, iteration, total_pages)
  225. print(report)
  226. if score >= config.quality_threshold:
  227. prs.save(output_path)
  228. print(f'[PASS] 高质量 PPT 已输出: {output_path}')
  229. else:
  230. has_critical_final = any(i.severity == 'critical' for i in final_issues)
  231. has_layout_critical = any(
  232. i.severity == 'critical' and i.category == 'layout'
  233. for i in final_issues
  234. )
  235. if has_layout_critical:
  236. raise RuntimeError(
  237. f'PPT 存在严重布局问题(评分 {score}),无法自动修复。'
  238. f'请检查页面配置和数据。'
  239. )
  240. prs.save(output_path)
  241. if has_critical_final:
  242. print(f'[WARN] 质量评分 {score}(低于阈值 {config.quality_threshold}),'
  243. f'存在 {by_sev.get("critical", 0)} 个严重内容问题,建议补充分析数据后重新生成')
  244. else:
  245. print(f'[WARN] 质量评分 {score}(低于阈值 {config.quality_threshold}),已输出但建议复核')
  246. return prs, final_issues
  247. def _check_layout(self, slide, page_idx) -> list[QualityIssue]:
  248. issues = []
  249. sw = int(slide.slide_width) if hasattr(slide, 'slide_width') else SLIDE_WIDTH
  250. sh = int(slide.slide_height) if hasattr(slide, 'slide_height') else SLIDE_HEIGHT
  251. for shape in slide.shapes:
  252. l, t = int(shape.left), int(shape.top)
  253. w, h = int(shape.width), int(shape.height)
  254. if l < -100:
  255. issues.append(QualityIssue('critical', 'layout', page_idx,
  256. f'形状"{_shape_name(shape)}"飞出页面左边界 (left={l})',
  257. 'L001', True, {'shape': shape, 'type': 'left'}))
  258. if l + w > sw + 500:
  259. issues.append(QualityIssue('critical', 'layout', page_idx,
  260. f'形状"{_shape_name(shape)}"飞出页面右边界 (right={l+w}, max={sw})',
  261. 'L002', True, {'shape': shape, 'type': 'right'}))
  262. if t < -100:
  263. issues.append(QualityIssue('critical', 'layout', page_idx,
  264. f'形状"{_shape_name(shape)}"飞出页面顶部 (top={t})',
  265. 'L003', True, {'shape': shape, 'type': 'top'}))
  266. if t + h > sh + 500:
  267. issues.append(QualityIssue('critical', 'layout', page_idx,
  268. f'形状"{_shape_name(shape)}"飞出页面底部 (bottom={t+h}, max={sh})',
  269. 'L004', True, {'shape': shape, 'type': 'bottom'}))
  270. if l < SAFE_MARGIN and l >= 0:
  271. if l == 0 and w >= sw * 0.8:
  272. continue
  273. if int(shape.top) < 0 or int(shape.top) + int(shape.height) < Emu(100000):
  274. continue
  275. if int(shape.top) > sh - Emu(500000):
  276. continue
  277. issues.append(QualityIssue('minor', 'layout', page_idx,
  278. f'形状"{_shape_name(shape)}"过于靠近左边缘',
  279. 'L007', True, {'shape': shape, 'type': 'edge_left'}))
  280. placeholder_pattern = re.compile(r'\{[^}]+\}')
  281. for shape in slide.shapes:
  282. if shape.has_text_frame:
  283. text = shape.text_frame.text
  284. if placeholder_pattern.search(text):
  285. issues.append(QualityIssue('critical', 'layout', page_idx,
  286. f'发现未替换占位符: "{text[:50]}"',
  287. 'L006', True, {'shape': shape, 'type': 'placeholder'}))
  288. empty_artifacts = self._find_empty_template_artifacts(slide)
  289. for shape in empty_artifacts:
  290. issues.append(QualityIssue(
  291. 'major', 'layout', page_idx,
  292. f'发现空模板组件残留: "{_shape_name(shape)}"',
  293. 'L008', True, {'shape': shape, 'type': 'empty_template_artifact'}
  294. ))
  295. shapes_list = list(slide.shapes)
  296. for i, a in enumerate(shapes_list):
  297. for b in shapes_list[i+1:]:
  298. if self._shapes_overlap(a, b):
  299. a_name = _shape_name(a)
  300. b_name = _shape_name(b)
  301. if self._is_intentional_overlap(a, b):
  302. continue
  303. issues.append(QualityIssue('major', 'layout', page_idx,
  304. f'形状"{a_name}"与"{b_name}"存在重叠',
  305. 'L005', True, {'shape_a': a, 'shape_b': b, 'type': 'overlap'}))
  306. return issues
  307. def _check_visual(self, slide, page_idx) -> list[QualityIssue]:
  308. issues = []
  309. fonts_seen = {}
  310. for shape in slide.shapes:
  311. if not shape.has_text_frame:
  312. continue
  313. for para in shape.text_frame.paragraphs:
  314. for run in para.runs:
  315. if run.font.size:
  316. size_pt = run.font.size / 12700.0
  317. if size_pt < 6:
  318. issues.append(QualityIssue('major', 'visual', page_idx,
  319. f'字号过小 ({size_pt:.1f}pt): "{run.text[:20]}"',
  320. 'V002', True, {'run': run, 'type': 'font_small'}))
  321. elif size_pt > 65:
  322. issues.append(QualityIssue('major', 'visual', page_idx,
  323. f'字号过大 ({size_pt:.1f}pt): "{run.text[:20]}"',
  324. 'V003', True, {'run': run, 'type': 'font_large'}))
  325. if run.font.name:
  326. fonts_seen[run.font.name] = fonts_seen.get(run.font.name, 0) + 1
  327. if len(fonts_seen) > 3:
  328. issues.append(QualityIssue('minor', 'visual', page_idx,
  329. f'字体使用超过3种: {list(fonts_seen.keys())}',
  330. 'V001', True, {'type': 'font_mixed', 'fonts': fonts_seen}))
  331. return issues
  332. def _check_content(self, slide, page_idx, config, prs, page_type='content') -> list[QualityIssue]:
  333. # Resolve dynamic content top from layout context if available
  334. content_top_emu = None
  335. if self.layout_context:
  336. content_top_emu = self.layout_context.content_top
  337. issues = []
  338. if page_type in ('cover', 'end'):
  339. issues += self._check_text_overflow(slide, page_idx)
  340. if page_type == 'cover':
  341. issues += self._check_cover_quality(slide, page_idx)
  342. return issues
  343. issues += self._check_dynamic_page_fit(page_idx, page_type, config)
  344. issues += self._check_core_metric_presence(slide, page_idx, page_type, config)
  345. if page_type == 'toc':
  346. content_shapes = [s for s in slide.shapes
  347. if s.has_text_frame and _is_in_content_area(s)]
  348. all_content_text = ''
  349. for shape in content_shapes:
  350. text = shape.text_frame.text.strip()
  351. if text:
  352. all_content_text += text + '\n'
  353. if len(all_content_text.strip()) < 30:
  354. issues.append(QualityIssue('minor', 'content', page_idx,
  355. '目录页内容过少',
  356. 'C008', False, {'type': 'empty_page'}))
  357. return issues
  358. fill_ratio = calculate_fill_ratio(slide, content_top_emu=content_top_emu)
  359. if page_type in ('kpi_overview', 'trend', 'distribution', 'ranking', 'summary') or page_type in FORECAST_PAGE_TYPES:
  360. if fill_ratio < FILL_RATIO_THRESHOLDS['sparse']:
  361. issues.append(QualityIssue('critical', 'content', page_idx,
  362. f'页面内容严重不足,填充率仅 {fill_ratio:.1%},必须补充图表和分析文本',
  363. 'C001', True, {'type': 'sparse', 'fill_ratio': fill_ratio}))
  364. elif fill_ratio < FILL_RATIO_THRESHOLDS['low']:
  365. issues.append(QualityIssue('major', 'content', page_idx,
  366. f'页面留白偏多,填充率 {fill_ratio:.1%},需补充分析内容',
  367. 'C001', True, {'type': 'sparse', 'fill_ratio': fill_ratio}))
  368. elif fill_ratio < FILL_RATIO_THRESHOLDS['sparse'] / 2:
  369. issues.append(QualityIssue('minor', 'content', page_idx,
  370. f'页面填充率过低 {fill_ratio:.1%}',
  371. 'C001', False))
  372. content_shapes = [s for s in slide.shapes
  373. if s.has_text_frame and _is_in_content_area(s)]
  374. all_content_text = ''
  375. insight_blocks = 0
  376. for shape in content_shapes:
  377. tf = shape.text_frame
  378. full_text = tf.text.strip()
  379. if not full_text:
  380. continue
  381. all_content_text += full_text + '\n'
  382. for para in tf.paragraphs:
  383. para_text = para.text.strip()
  384. if para_text and len(para_text) >= TEXT_MIN_LENGTH:
  385. insight_blocks += 1
  386. total_content_chars = len(all_content_text.strip())
  387. text_lengths = [len(p.text.strip()) for s in content_shapes
  388. for p in s.text_frame.paragraphs if p.text.strip()]
  389. if total_content_chars < PAGE_MIN_TEXT_LENGTH:
  390. issues.append(QualityIssue('critical', 'content', page_idx,
  391. f'页面内容为空!所有文本框总字数仅 {total_content_chars} 字(要求≥{PAGE_MIN_TEXT_LENGTH}字)',
  392. 'C008', True, {'type': 'empty_page', 'char_count': total_content_chars}))
  393. elif total_content_chars < 200:
  394. issues.append(QualityIssue('major', 'content', page_idx,
  395. f'页面内容过少,总字数仅 {total_content_chars} 字,分析深度严重不足',
  396. 'C008', True, {'type': 'empty_page', 'char_count': total_content_chars}))
  397. if text_lengths and max(text_lengths) < TEXT_MIN_LENGTH:
  398. issues.append(QualityIssue('critical', 'content', page_idx,
  399. f'分析文本过短(最长为 {max(text_lengths)} 字),需撰写≥{TEXT_MIN_LENGTH}字的深度分析',
  400. 'C005', True, {'type': 'short_text', 'max_length': max(text_lengths)}))
  401. if insight_blocks < INSIGHT_MIN_COUNT:
  402. issues.append(QualityIssue('critical', 'content', page_idx,
  403. f'分析段数不足,仅 {insight_blocks} 段(要求≥{INSIGHT_MIN_COUNT}段)',
  404. 'C007', True, {'type': 'insight_count', 'count': insight_blocks}))
  405. has_title = False
  406. for shape in slide.shapes:
  407. if shape.has_text_frame:
  408. text = shape.text_frame.text
  409. try:
  410. sy = int(shape.top)
  411. except Exception:
  412. sy = 99999999
  413. if sy < CONTENT_TOP_BASE + Emu(100000) and sy > Emu(500000):
  414. if len(text.strip()) > 0 and not text.startswith('{'):
  415. has_title = True
  416. break
  417. if any(kw in text for kw in ['概览', '趋势', '分布', '分析', '总结',
  418. '排行', '报告', '建议', '告警', '要点']):
  419. if sy < CONTENT_TOP_BASE + Emu(400000):
  420. has_title = True
  421. break
  422. if not has_title and page_idx > 0 and page_idx < len(prs.slides) - 1:
  423. issues.append(QualityIssue('critical', 'content', page_idx,
  424. '页面缺少标题', 'C006', True, {'type': 'missing_title'}))
  425. issues += self._check_text_overflow(slide, page_idx)
  426. has_chart = False
  427. for shape in slide.shapes:
  428. if shape.has_chart:
  429. has_chart = True
  430. break
  431. if has_chart and insight_blocks == 0 and page_idx >= 2:
  432. issues.append(QualityIssue('critical', 'content', page_idx,
  433. '页面有图表但完全缺少分析文本,图表数据需要被解读和说明',
  434. 'C009', True, {'type': 'chart_no_text'}))
  435. return issues
  436. # ---- Cover page quality checks (V006, C012) ----
  437. # These catch the most common python-pptx template pitfalls before the
  438. # user sees the output: white text on white background, and unfilled
  439. # template default text in placeholders.
  440. _COVER_TEMPLATE_DEFAULT_PATTERNS = [
  441. '单击此处编辑母版标题样式', '单击此处添加标题',
  442. '单击此处编辑母版文本样式', '单击此处添加文本',
  443. '单击此处添加副标题',
  444. ]
  445. # Light/pale colors that are invisible on white backgrounds
  446. _LIGHT_COLOR_THRESHOLD = 0xCC # RGB channels above this = "very light"
  447. def _check_cover_quality(self, slide, page_idx) -> list[QualityIssue]:
  448. """Check cover page for common template rendering issues.
  449. V006: placeholder text is white/light but positioned on light
  450. background (e.g. below a colored banner). Auto-fixable.
  451. C012: placeholder still contains template default text like
  452. "单击此处编辑母版标题样式". Not auto-fixable — needs rebuild.
  453. """
  454. issues = []
  455. slide_h = int(slide.slide_height) if hasattr(slide, 'slide_height') else SLIDE_HEIGHT
  456. # Detect the approximate end of the colored banner area.
  457. # Heuristic: find the tallest filled rectangle that starts at y=0.
  458. banner_bottom = 0
  459. for shape in slide.shapes:
  460. try:
  461. sy = int(shape.top)
  462. sh = int(shape.height)
  463. if sy < Emu(50000): # starts near top
  464. if sh < slide_h * 0.7: # not full-slide background
  465. banner_bottom = max(banner_bottom, sy + sh)
  466. except Exception:
  467. pass
  468. for shape in slide.shapes:
  469. if not shape.is_placeholder or not shape.has_text_frame:
  470. continue
  471. tf = shape.text_frame
  472. text = tf.text.strip()
  473. # --- C012: template default text ---
  474. if text and any(p in text for p in self._COVER_TEMPLATE_DEFAULT_PATTERNS):
  475. issues.append(QualityIssue(
  476. 'critical', 'content', page_idx,
  477. f'封面占位符仍为模板默认文字 "{text[:30]}",idx={shape.placeholder_format.idx}',
  478. 'C012', False,
  479. {'type': 'cover_template_text', 'shape': shape}
  480. ))
  481. continue
  482. # --- V006: white/light text on light background ---
  483. if text and banner_bottom > 0:
  484. try:
  485. sy = int(shape.top)
  486. except Exception:
  487. continue
  488. # Only flag text BELOW the banner (on white area)
  489. if sy < banner_bottom:
  490. continue
  491. # Check if text color is very light / near-white
  492. for para in tf.paragraphs:
  493. for run in para.runs:
  494. if run.font.color and run.font.color.rgb:
  495. rgb = run.font.color.rgb
  496. if (int(str(rgb)[:2], 16) >= self._LIGHT_COLOR_THRESHOLD and
  497. int(str(rgb)[2:4], 16) >= self._LIGHT_COLOR_THRESHOLD and
  498. int(str(rgb)[4:6], 16) >= self._LIGHT_COLOR_THRESHOLD):
  499. issues.append(QualityIssue(
  500. 'critical', 'visual', page_idx,
  501. f'封面文字 "{text[:20]}" 颜色过浅 (#{rgb}) '
  502. f'位于白色背景区域(>y={banner_bottom})将不可见',
  503. 'V006', True,
  504. {'type': 'cover_text_invisible', 'shape': shape,
  505. 'banner_bottom': banner_bottom}
  506. ))
  507. break
  508. return issues
  509. def _check_text_overflow(self, slide, page_idx) -> list[QualityIssue]:
  510. issues = []
  511. for shape in slide.shapes:
  512. if shape.has_text_frame and self._is_text_overflowing(shape):
  513. issues.append(QualityIssue(
  514. 'major', 'content', page_idx,
  515. f'文本可能超出文本框边界: "{shape.text_frame.text[:30]}"',
  516. 'C004', True, {'shape': shape, 'type': 'text_overflow'}
  517. ))
  518. return issues
  519. def _check_dynamic_page_fit(self, page_idx, page_type, config) -> list[QualityIssue]:
  520. issues = []
  521. profile = getattr(config, 'data_profiling', None) or {}
  522. if not profile:
  523. return issues
  524. time_cols = profile.get('time_columns', [])
  525. cat_cols = profile.get('category_columns', [])
  526. num_cols = profile.get('numeric_columns', [])
  527. if page_type == 'trend' and (not time_cols or not num_cols):
  528. issues.append(QualityIssue(
  529. 'critical', 'content', page_idx,
  530. '趋势页缺少可用时间列或数值列,需要重建或降级为摘要页',
  531. 'C010', True, {'type': 'dynamic_page_not_supported', 'page_type': page_type}
  532. ))
  533. elif page_type in ('distribution', 'ranking') and (not cat_cols or not num_cols):
  534. issues.append(QualityIssue(
  535. 'critical', 'content', page_idx,
  536. f'{page_type} 页缺少分类维度或数值列,需要重建或降级为摘要页',
  537. 'C010', True, {'type': 'dynamic_page_not_supported', 'page_type': page_type}
  538. ))
  539. elif page_type == 'kpi_overview':
  540. selected_metrics = [m for m in getattr(config, 'metrics', []) if getattr(m, 'selected', True)]
  541. if len(selected_metrics) > 6:
  542. issues.append(QualityIssue(
  543. 'minor', 'content', page_idx,
  544. f'核心指标数量 {len(selected_metrics)} 超过 6 个,KPI页应切换为紧凑布局或拆分展示',
  545. 'C011', True, {'type': 'kpi_layout_over_capacity', 'count': len(selected_metrics)}
  546. ))
  547. elif len(selected_metrics) >= 4:
  548. issues.append(QualityIssue(
  549. 'minor', 'content', page_idx,
  550. f'核心指标数量 {len(selected_metrics)} 较多,建议使用紧凑布局以保留洞察区',
  551. 'C011', True, {'type': 'kpi_layout_compact_needed', 'count': len(selected_metrics)}
  552. ))
  553. return issues
  554. def _check_core_metric_presence(self, slide, page_idx, page_type, config) -> list[QualityIssue]:
  555. issues = []
  556. if page_type != 'kpi_overview' or not config:
  557. return issues
  558. selected_metrics = [m for m in getattr(config, 'metrics', []) if getattr(m, 'selected', True)]
  559. if not selected_metrics:
  560. return issues
  561. slide_text = '\n'.join(
  562. shape.text_frame.text for shape in slide.shapes
  563. if shape.has_text_frame and shape.text_frame.text
  564. )
  565. missing = [m.label for m in selected_metrics[:6] if m.label and m.label not in slide_text]
  566. if missing:
  567. issues.append(QualityIssue(
  568. 'critical', 'data', page_idx,
  569. 'KPI概览页缺少已确认核心指标:' + '、'.join(missing),
  570. 'D006', True, {'type': 'core_metric_missing', 'missing': missing}
  571. ))
  572. return issues
  573. def _check_data(self, slide, page_idx, prs) -> list[QualityIssue]:
  574. issues = []
  575. if page_idx == 0:
  576. return issues
  577. for shape in slide.shapes:
  578. if shape.has_text_frame:
  579. text = shape.text_frame.text
  580. page_pattern = re.search(r'(\d+)\s*/\s*(\d+)', text)
  581. if page_pattern:
  582. current = int(page_pattern.group(1))
  583. total = int(page_pattern.group(2))
  584. if total == 0:
  585. issues.append(QualityIssue('major', 'data', page_idx,
  586. f'页码格式异常: {text.strip()}',
  587. 'D002', True, {'type': 'page_num'}))
  588. return issues
  589. def _fix_layout(self, slide, issue):
  590. fd = issue.fix_data
  591. if fd.get('type') in ('left', 'right', 'top', 'bottom'):
  592. shape = fd.get('shape')
  593. if shape:
  594. ensure_safe_position(shape, SLIDE_WIDTH, SLIDE_HEIGHT)
  595. fd['fixed'] = True
  596. elif fd.get('type') == 'overlap':
  597. a, b = fd.get('shape_a'), fd.get('shape_b')
  598. if a and b:
  599. try:
  600. if int(b.left) < int(a.left) + int(a.width) + Emu(50000):
  601. b.left = int(a.left) + int(a.width) + Emu(152400)
  602. ensure_safe_position(b, SLIDE_WIDTH, SLIDE_HEIGHT)
  603. except Exception:
  604. pass
  605. fd['fixed'] = True
  606. elif fd.get('type') == 'placeholder':
  607. shape = fd.get('shape')
  608. if shape and shape.has_text_frame:
  609. text = shape.text_frame.text or ''
  610. # For KPI placeholders, remove the entire shape and nearby card backgrounds
  611. kpi_pattern = re.compile(r'\{kpi\d+_(label|value)\}')
  612. if kpi_pattern.search(text):
  613. # Remove this text shape
  614. self._remove_shape(shape)
  615. # Also remove nearby rounded rectangle backgrounds
  616. try:
  617. sx = int(shape.left)
  618. sy = int(shape.top)
  619. sw = int(shape.width)
  620. sh = int(shape.height)
  621. pad = 300000
  622. for other in list(slide.shapes):
  623. try:
  624. ox = int(other.left)
  625. oy = int(other.top)
  626. ow = int(other.width)
  627. oh = int(other.height)
  628. in_region = (
  629. ox >= sx - pad and ox + ow <= sx + sw + pad and
  630. oy >= sy - pad and oy + oh <= sy + sh + pad
  631. )
  632. if in_region and other != shape:
  633. # Check if it's a background shape (no text or empty text)
  634. if not other.has_text_frame or not (other.text_frame.text or '').strip():
  635. self._remove_shape(other)
  636. except Exception:
  637. pass
  638. except Exception:
  639. pass
  640. else:
  641. # For other placeholders, just clear the text
  642. for para in shape.text_frame.paragraphs:
  643. para.text = re.sub(r'\{[^}]+\}', '', para.text)
  644. fd['fixed'] = True
  645. elif fd.get('type') == 'edge_left':
  646. shape = fd.get('shape')
  647. if shape:
  648. try:
  649. w = int(shape.width)
  650. if w < SLIDE_WIDTH * 0.5:
  651. shape.left = SAFE_MARGIN
  652. except Exception:
  653. pass
  654. fd['fixed'] = True
  655. elif fd.get('type') == 'empty_template_artifact':
  656. shape = fd.get('shape')
  657. if shape:
  658. self._remove_shape(shape)
  659. fd['fixed'] = True
  660. def _fix_visual(self, slide, issue):
  661. fd = issue.fix_data
  662. if fd.get('type') == 'font_small':
  663. run = fd.get('run')
  664. if run:
  665. run.font.size = FONT_SIZE_MIN
  666. fd['fixed'] = True
  667. elif fd.get('type') == 'font_large':
  668. run = fd.get('run')
  669. if run:
  670. run.font.size = FONT_SIZE_MAX
  671. fd['fixed'] = True
  672. elif fd.get('type') == 'font_mixed':
  673. for shape in slide.shapes:
  674. if shape.has_text_frame:
  675. for para in shape.text_frame.paragraphs:
  676. for run in para.runs:
  677. run.font.name = DEFAULT_FONT
  678. fd['fixed'] = True
  679. elif fd.get('type') == 'cover_text_invisible':
  680. shape = fd.get('shape')
  681. if shape and shape.has_text_frame:
  682. dark_color = self.theme_colors.get('primary', RGBColor(0x1E, 0x3A, 0x5F))
  683. for para in shape.text_frame.paragraphs:
  684. for run in para.runs:
  685. run.font.color.rgb = dark_color
  686. fd['fixed'] = True
  687. def _fix_content(self, slide, issue, prs):
  688. fd = issue.fix_data
  689. if fd.get('type') == 'sparse':
  690. fill_ratio = fd.get('fill_ratio', 0)
  691. if fill_ratio < FILL_RATIO_THRESHOLDS['low']:
  692. try:
  693. box = slide.shapes.add_textbox(
  694. CONTENT_LEFT, Emu(int(FOOTER_TOP) - Emu(1600000)),
  695. Emu(SLIDE_WIDTH - 2 * CONTENT_LEFT - Emu(200000)), Emu(1500000))
  696. tf = box.text_frame
  697. tf.word_wrap = True
  698. p = tf.paragraphs[0]
  699. p.text = (
  700. '[WARNING] 此页面内容不足,需补充深度分析内容。'
  701. '分析应包含:具体数据引用(含数值和单位)、'
  702. '与同类/历史/目标的对比分析、'
  703. '数据背后原因的至少2条解读、'
  704. '以及可执行的业务行动建议。'
  705. '请勿使用"要加强"、"进一步优化"等模糊措辞。'
  706. )
  707. p.font.size = Pt(12)
  708. p.font.color.rgb = RGBColor(0xCC, 0x33, 0x00)
  709. p.font.name = DEFAULT_FONT
  710. p.font.bold = True
  711. fd['fixed'] = True
  712. fd['needs_rebuild'] = True
  713. except Exception:
  714. pass
  715. elif fd.get('type') == 'empty_page':
  716. fd['needs_rebuild'] = True
  717. fd['fixed'] = True
  718. elif fd.get('type') == 'chart_no_text':
  719. fd['needs_rebuild'] = True
  720. fd['fixed'] = True
  721. elif fd.get('type') == 'insight_count':
  722. fd['needs_rebuild'] = True
  723. fd['fixed'] = True
  724. elif fd.get('type') == 'short_text':
  725. fd['needs_rebuild'] = True
  726. fd['fixed'] = True
  727. elif fd.get('type') in ('dynamic_page_not_supported', 'kpi_layout_over_capacity'):
  728. fd['fixed'] = True
  729. elif fd.get('type') == 'kpi_layout_compact_needed':
  730. fd['fixed'] = True
  731. elif fd.get('type') == 'core_metric_missing':
  732. fd['needs_rebuild'] = True
  733. fd['fixed'] = True
  734. elif fd.get('type') == 'missing_title':
  735. try:
  736. box = slide.shapes.add_textbox(
  737. CONTENT_LEFT, Emu(914400),
  738. Emu(SLIDE_WIDTH - 2 * CONTENT_LEFT - Emu(200000)), Emu(508000))
  739. p = box.text_frame.paragraphs[0]
  740. p.text = '数据详情'
  741. p.font.size = Pt(24)
  742. p.font.bold = True
  743. p.font.color.rgb = RGBColor(0x33, 0x33, 0x33)
  744. p.font.name = DEFAULT_FONT
  745. fd['fixed'] = True
  746. except Exception:
  747. pass
  748. elif fd.get('type') == 'text_overflow':
  749. shape = fd.get('shape')
  750. if shape and shape.has_text_frame:
  751. text_len = len(shape.text_frame.text or '')
  752. try:
  753. if text_len > 180 or int(shape.top) + int(shape.height) > int(FOOTER_TOP) - Emu(120000):
  754. fd['needs_rebuild'] = True
  755. else:
  756. for para in shape.text_frame.paragraphs:
  757. for run in para.runs:
  758. if run.font.size and run.font.size > Pt(9):
  759. run.font.size = Pt(9)
  760. except Exception:
  761. fd['needs_rebuild'] = True
  762. fd['fixed'] = True
  763. def _fix_data(self, slide, issue, prs):
  764. fd = issue.fix_data
  765. if fd.get('type') == 'page_num':
  766. fd['fixed'] = True
  767. def _shapes_overlap(self, a, b) -> bool:
  768. ax, ay, aw, ah = int(a.left), int(a.top), int(a.width), int(a.height)
  769. bx, by, bw, bh = int(b.left), int(b.top), int(b.width), int(b.height)
  770. if ax + aw <= bx or bx + bw <= ax:
  771. return False
  772. if ay + ah <= by or by + bh <= ay:
  773. return False
  774. return True
  775. def _is_intentional_overlap(self, a, b) -> bool:
  776. if hasattr(a, 'is_placeholder') or hasattr(b, 'is_placeholder'):
  777. return True
  778. a_area = int(a.width) * int(a.height)
  779. b_area = int(b.width) * int(b.height)
  780. if a_area > b_area * 3 or b_area > a_area * 3:
  781. return True
  782. return False
  783. def _is_title_shape(self, shape) -> bool:
  784. if not shape.has_text_frame:
  785. return False
  786. try:
  787. y = int(shape.top)
  788. return y < int(CONTENT_TOP_BASE) + Emu(200000)
  789. except Exception:
  790. return False
  791. def _find_empty_template_artifacts(self, slide) -> list:
  792. artifacts = []
  793. shapes = list(slide.shapes)
  794. empty_text_boxes = []
  795. for shape in shapes:
  796. if shape.has_text_frame:
  797. text = (shape.text_frame.text or '').strip()
  798. if text:
  799. continue
  800. if int(shape.width) < Emu(200000) or int(shape.height) < Emu(120000):
  801. continue
  802. if int(shape.top) < Emu(900000) or int(shape.top) > int(FOOTER_TOP) - Emu(100000):
  803. continue
  804. empty_text_boxes.append(shape)
  805. artifacts.append(shape)
  806. for shape in shapes:
  807. if shape.has_text_frame:
  808. continue
  809. try:
  810. is_large_soft_card = (
  811. int(shape.width) >= Emu(1000000) and
  812. int(shape.height) >= Emu(500000) and
  813. int(shape.top) < int(FOOTER_TOP) - Emu(400000)
  814. )
  815. if not is_large_soft_card:
  816. continue
  817. overlaps_empty_text = any(self._shapes_overlap(shape, box) for box in empty_text_boxes)
  818. if overlaps_empty_text:
  819. artifacts.append(shape)
  820. except Exception:
  821. continue
  822. # Preserve order while de-duplicating.
  823. seen = set()
  824. unique = []
  825. for shape in artifacts:
  826. key = id(shape)
  827. if key not in seen:
  828. unique.append(shape)
  829. seen.add(key)
  830. return unique
  831. def _remove_shape(self, shape):
  832. el = shape.element
  833. el.getparent().remove(el)
  834. def _is_text_overflowing(self, shape) -> bool:
  835. if not shape.has_text_frame:
  836. return False
  837. text = shape.text_frame.text
  838. if not text.strip():
  839. return False
  840. if len(text) > 800:
  841. return True
  842. try:
  843. w = int(shape.width)
  844. h = int(shape.height)
  845. width_pt = max(1, w / 12700.0)
  846. max_font_pt = 10
  847. para_count = 0
  848. for para in shape.text_frame.paragraphs:
  849. if not para.text.strip():
  850. continue
  851. para_count += 1
  852. for run in para.runs:
  853. if run.font.size:
  854. max_font_pt = max(max_font_pt, run.font.size / 12700.0)
  855. chars_per_line = max(8, int(width_pt / (max_font_pt * 1.15)))
  856. est_lines = max(1, (len(text) + chars_per_line - 1) // chars_per_line)
  857. est_height = int((est_lines * max_font_pt * 1.2 + para_count * 4) * 12700)
  858. if est_height > h * 1.15:
  859. return True
  860. if h < Emu(200000) and len(text) > 80:
  861. return True
  862. except Exception:
  863. pass
  864. return False
  865. def _shape_name(shape):
  866. try:
  867. if shape.has_text_frame:
  868. return shape.text_frame.text[:20].replace('\n', ' ')
  869. except Exception:
  870. pass
  871. try:
  872. return shape.shape_type
  873. except Exception:
  874. pass
  875. return '无名形状'
  876. def _is_in_content_area(shape):
  877. try:
  878. return int(shape.top) >= int(CONTENT_TOP_BASE)
  879. except Exception:
  880. return False
if __name__ == '__main__':
    # Smoke check when the file is executed directly: print a banner,
    # construct a QualityInspector with its default arguments, then report
    # readiness.  No presentation is opened or inspected here.
    print("QualityInspector module loaded")
    inspector = QualityInspector()
    print("Ready to inspect PPT files")