quality_inspector.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882
  1. """
  2. PPT quality inspector and auto-fix engine.
  3. Inspects generated PPT for layout, visual, content, and data issues,
  4. then auto-fixes them iteratively until quality threshold is met.
  5. """
  6. import re
  7. from pptx import Presentation
  8. from pptx.util import Emu, Pt
  9. from pptx.dml.color import RGBColor
  10. from collections import Counter
  11. from quality_rules import (
  12. QUALITY_RULES, SEVERITY_WEIGHTS, CATEGORY_WEIGHTS,
  13. FILL_RATIO_THRESHOLDS, FONT_SIZE_MIN, FONT_SIZE_MAX,
  14. TEXT_MIN_LENGTH, INSIGHT_MIN_COUNT, PAGE_MIN_TEXT_LENGTH,
  15. SAFE_MARGIN, CONTENT_LEFT, CONTENT_TOP_BASE,
  16. FOOTER_TOP, SLIDE_WIDTH, SLIDE_HEIGHT, DEFAULT_FONT,
  17. get_quality_label, calculate_score,
  18. )
  19. from page_layouts import calculate_fill_ratio, ensure_safe_position
# Page types treated as forecast/plan pages. _check_content applies the same
# fill-ratio thresholds to these as to the built-in analysis page types.
FORECAST_PAGE_TYPES = {
    'forecast',
    'prediction',
    'plan',
    'monthly_forecast',
    'monthly_plan',
    'next_month_plan',
    'custom_forecast',
    'custom_prediction',
}
  30. class QualityIssue:
  31. def __init__(self, severity, category, page_index, description,
  32. rule_id='', auto_fixable=True, fix_data=None):
  33. self.severity = severity
  34. self.category = category
  35. self.page_index = page_index
  36. self.description = description
  37. self.rule_id = rule_id
  38. self.auto_fixable = auto_fixable
  39. self.fix_data = fix_data or {}
  40. def __repr__(self):
  41. return f"[{self.severity}] Page {self.page_index+1}: {self.description}"
  42. class QualityInspector:
    def __init__(self, theme_colors: dict = None):
        """Create an inspector.

        Args:
            theme_colors: Optional theme color mapping; stored as given, or
                as an empty dict when omitted or falsy.
        """
        self.theme_colors = theme_colors or {}
        # Counter reset and updated by auto_fix().
        self.fix_count = 0
        # Failure messages collected by auto_fix() when a fix handler raises.
        self.fix_log = []
  47. def inspect(self, prs: Presentation, config=None) -> list[QualityIssue]:
  48. issues = []
  49. issues += self._check_confirmation_alignment(prs, config)
  50. for page_idx, slide in enumerate(prs.slides):
  51. page_type = self._get_page_type(page_idx, config, len(prs.slides))
  52. issues += self._check_layout(slide, page_idx)
  53. issues += self._check_visual(slide, page_idx)
  54. issues += self._check_content(slide, page_idx, config, prs, page_type)
  55. issues += self._check_data(slide, page_idx, prs)
  56. return issues
  57. def _get_page_type(self, page_idx: int, config, total_slides: int) -> str:
  58. if config and hasattr(config, 'pages') and page_idx < len(config.pages):
  59. return config.pages[page_idx].page_type
  60. if page_idx == 0:
  61. return 'cover'
  62. if page_idx == total_slides - 1:
  63. return 'end'
  64. if page_idx == 1:
  65. return 'toc'
  66. return 'content'
  67. def _check_confirmation_alignment(self, prs, config) -> list[QualityIssue]:
  68. issues = []
  69. if not config:
  70. return issues
  71. selected_pages = [p for p in getattr(config, 'pages', []) if getattr(p, 'selected', True)]
  72. if getattr(config, 'require_six_confirmations', False):
  73. confirmation = getattr(config, 'user_confirmation', None)
  74. if confirmation and hasattr(confirmation, 'is_complete') and not confirmation.is_complete():
  75. issues.append(QualityIssue(
  76. 'critical', 'data', -1,
  77. '六项确认未完成,PPT 不应进入输出阶段',
  78. 'D006', False,
  79. {'type': 'confirmation_incomplete'}
  80. ))
  81. if config and getattr(config, 'page_count_range', None) and selected_pages:
  82. low, high = config.page_count_range
  83. if len(selected_pages) < low or len(selected_pages) > high:
  84. issues.append(QualityIssue(
  85. 'major', 'data', -1,
  86. f'页面数量 {len(selected_pages)} 不在确认范围 {low}-{high} 内',
  87. 'D006', False,
  88. {'type': 'page_count_range'}
  89. ))
  90. if config and getattr(config, 'metrics', None) and len(selected_pages) > 0:
  91. selected_metrics = [m for m in config.metrics if getattr(m, 'selected', True)]
  92. if not selected_metrics:
  93. issues.append(QualityIssue(
  94. 'critical', 'data', -1,
  95. '未找到已确认的核心指标,无法验证输出一致性',
  96. 'D006', False,
  97. {'type': 'missing_metrics'}
  98. ))
  99. return issues
  100. def auto_fix(self, prs: Presentation, issues: list[QualityIssue]):
  101. fixable = [i for i in issues if i.auto_fixable]
  102. self.fix_count = 0
  103. self.fix_log = []
  104. for issue in fixable:
  105. try:
  106. if issue.page_index < 0:
  107. continue
  108. slide = prs.slides[issue.page_index]
  109. self._apply_fix(slide, issue, prs)
  110. self.fix_count += 1
  111. except Exception as e:
  112. self.fix_log.append(f"Fix failed for {issue.rule_id}: {e}")
  113. return self.fix_count
  114. def _apply_fix(self, slide, issue, prs):
  115. category = issue.category
  116. if category == 'layout':
  117. self._fix_layout(slide, issue)
  118. elif category == 'visual':
  119. self._fix_visual(slide, issue)
  120. elif category == 'content':
  121. self._fix_content(slide, issue, prs)
  122. elif category == 'data':
  123. self._fix_data(slide, issue, prs)
  124. def generate_report(self, issues: list[QualityIssue], iteration: int = 1,
  125. total_pages: int = 0) -> str:
  126. lines = []
  127. lines.append('═' * 50)
  128. lines.append(f' PPT 质量自检报告 (第 {iteration} 轮)')
  129. lines.append('═' * 50)
  130. if not issues:
  131. lines.append('[PASS] 全部通过!未发现任何质量问题。')
  132. return '\n'.join(lines)
  133. by_page = {}
  134. for iss in issues:
  135. p = iss.page_index
  136. if p not in by_page:
  137. by_page[p] = []
  138. by_page[p].append(iss)
  139. for p_idx in sorted(by_page.keys()):
  140. page_issues = by_page[p_idx]
  141. sev_order = {'critical': 0, 'major': 1, 'minor': 2}
  142. page_issues.sort(key=lambda x: sev_order.get(x.severity, 3))
  143. has_critical = any(i.severity == 'critical' for i in page_issues)
  144. has_major = any(i.severity == 'major' for i in page_issues)
  145. if has_critical:
  146. icon = '[CRIT]'
  147. elif has_major:
  148. icon = '[MAJ]'
  149. else:
  150. icon = '[OK]'
  151. lines.append(f'{icon} 第{p_idx+1}页: {len(page_issues)} 个问题')
  152. for iss in page_issues:
  153. sev_icon = {'critical': '[!!]', 'major': '[!]', 'minor': '[-]'}.get(iss.severity, '')
  154. status = ' [FIXED]' if iss.auto_fixable and iss.fix_data.get('fixed') else ''
  155. lines.append(f' ├─ {sev_icon} {iss.description}{status}')
  156. lines.append('─' * 50)
  157. by_sev = Counter(i.severity for i in issues)
  158. by_cat = Counter(i.category for i in issues)
  159. fixed = sum(1 for i in issues if i.auto_fixable and i.fix_data.get('fixed'))
  160. score = calculate_score(dict(by_sev), dict(by_cat), max(total_pages, 1))
  161. label = get_quality_label(score)
  162. lines.append(f'总结: {len(issues)} 个问题 | '
  163. f'{by_sev.get("critical", 0)} 严重 + '
  164. f'{by_sev.get("major", 0)} 主要 + '
  165. f'{by_sev.get("minor", 0)} 次要')
  166. lines.append(f'自动修复: {fixed}/{sum(1 for i in issues if i.auto_fixable)} 个')
  167. lines.append(f'最终质量评分: {score}/100 [{label}]')
  168. lines.append('═' * 50)
  169. return '\n'.join(lines)
    def quality_assured_build(self, build_fn, data, config, output_path,
                              max_iterations=None, _attempt=0) -> tuple:
        """Build a deck, then inspect and auto-fix it until it is acceptable.

        Args:
            build_fn: Callable invoked as ``build_fn(data, config)``; its
                result must expose ``slides`` and ``save``.
            data: Passed through to ``build_fn``.
            config: Read for ``max_fix_iterations`` and ``quality_threshold``
                and forwarded to the inspection methods.
            output_path: Destination handed to ``prs.save``.
            max_iterations: Upper bound on inspect/fix rounds; falls back to
                ``config.max_fix_iterations`` when falsy.
            _attempt: Not read anywhere in this method.

        Returns:
            ``(prs, final_issues)``: the presentation and the issues found by
            the last inspection.

        Raises:
            RuntimeError: When the final score is below the threshold and a
                critical layout issue remains; nothing is saved in that case.
        """
        max_iterations = max_iterations or config.max_fix_iterations
        total_pages = 0
        needs_rebuild = False
        rebuilt_once = False
        prs = None
        # NOTE(review): if max_iterations resolves to 0 the loop body never
        # runs, leaving prs as None and `iteration` unbound for the code below.
        for iteration in range(1, max_iterations + 1):
            if iteration == 1 or needs_rebuild:
                if needs_rebuild:
                    if rebuilt_once and iteration > 2:
                        print(f'[INFO] 已尝试重建,不再继续重建以避免无限循环')
                        needs_rebuild = False
                    else:
                        print(f'[REBUILD] 检测到需要重建的页面,触发重新生成...')
                        rebuilt_once = True
                        needs_rebuild = False
                # NOTE(review): indentation was lost in this listing. As
                # reconstructed here, build_fn runs on every pass through this
                # branch, including right after the "no further rebuild"
                # message above -- confirm against the original file.
                prs = build_fn(data, config)
                total_pages = len(prs.slides)
            issues = self.inspect(prs, config)
            if not issues:
                print(f'[PASS] 第 {iteration} 次迭代:无问题,质量达标')
                break
            by_sev = Counter(i.severity for i in issues)
            print(f'[INSPECT] 第 {iteration} 次自检:{by_sev.get("critical",0)} 严重 + '
                  f'{by_sev.get("major",0)} 主要 + {by_sev.get("minor",0)} 次要')
            fixable = [i for i in issues if i.auto_fixable]
            self.auto_fix(prs, fixable)
            print(f'[FIX] 自动修复了 {self.fix_count} 个问题')
            # Fix handlers request a rebuild by setting fix_data['needs_rebuild'].
            for issue in fixable:
                if issue.fix_data.get('needs_rebuild'):
                    needs_rebuild = True
                    print(f'[WARN] 检测到内容严重不足,将在下一轮迭代中重建')
                    break
            unfixable = [i for i in issues if not i.auto_fixable]
            if unfixable:
                print(f'[WARN] {len(unfixable)} 个问题需人工确认')
            # Re-inspect the fixed deck to decide whether another round is needed.
            remaining = self.inspect(prs, config)
            if not remaining:
                print(f'[PASS] 第 {iteration} 次修复后:所有问题已解决')
                break
            has_critical = any(i.severity == 'critical' for i in remaining)
            has_major = any(i.severity == 'major' for i in remaining)
            if not has_critical and not has_major:
                print(f'[PASS] 第 {iteration} 次修复后:仅剩次要问题,质量达标')
                break
            # This is the last statement of the loop body, so the `continue`
            # does not change control flow.
            if needs_rebuild and iteration < max_iterations:
                continue
        final_issues = self.inspect(prs, config)
        by_sev = Counter(i.severity for i in final_issues)
        by_cat = Counter(i.category for i in final_issues)
        score = calculate_score(dict(by_sev), dict(by_cat), max(total_pages, 1))
        # `label` is computed but not used below; generate_report derives its own.
        label = get_quality_label(score)
        report = self.generate_report(final_issues, iteration, total_pages)
        print(report)
        if score >= config.quality_threshold:
            prs.save(output_path)
            print(f'[PASS] 高质量 PPT 已输出: {output_path}')
        else:
            has_critical_final = any(i.severity == 'critical' for i in final_issues)
            has_layout_critical = any(
                i.severity == 'critical' and i.category == 'layout'
                for i in final_issues
            )
            # Critical layout problems block output entirely.
            if has_layout_critical:
                raise RuntimeError(
                    f'PPT 存在严重布局问题(评分 {score}),无法自动修复。'
                    f'请检查页面配置和数据。'
                )
            # Any other below-threshold deck is still saved, with a warning.
            prs.save(output_path)
            if has_critical_final:
                print(f'[WARN] 质量评分 {score}(低于阈值 {config.quality_threshold}),'
                      f'存在 {by_sev.get("critical", 0)} 个严重内容问题,建议补充分析数据后重新生成')
            else:
                print(f'[WARN] 质量评分 {score}(低于阈值 {config.quality_threshold}),已输出但建议复核')
        return prs, final_issues
  246. def _check_layout(self, slide, page_idx) -> list[QualityIssue]:
  247. issues = []
  248. sw = int(slide.slide_width) if hasattr(slide, 'slide_width') else SLIDE_WIDTH
  249. sh = int(slide.slide_height) if hasattr(slide, 'slide_height') else SLIDE_HEIGHT
  250. for shape in slide.shapes:
  251. l, t = int(shape.left), int(shape.top)
  252. w, h = int(shape.width), int(shape.height)
  253. if l < -100:
  254. issues.append(QualityIssue('critical', 'layout', page_idx,
  255. f'形状"{_shape_name(shape)}"飞出页面左边界 (left={l})',
  256. 'L001', True, {'shape': shape, 'type': 'left'}))
  257. if l + w > sw + 500:
  258. issues.append(QualityIssue('critical', 'layout', page_idx,
  259. f'形状"{_shape_name(shape)}"飞出页面右边界 (right={l+w}, max={sw})',
  260. 'L002', True, {'shape': shape, 'type': 'right'}))
  261. if t < -100:
  262. issues.append(QualityIssue('critical', 'layout', page_idx,
  263. f'形状"{_shape_name(shape)}"飞出页面顶部 (top={t})',
  264. 'L003', True, {'shape': shape, 'type': 'top'}))
  265. if t + h > sh + 500:
  266. issues.append(QualityIssue('critical', 'layout', page_idx,
  267. f'形状"{_shape_name(shape)}"飞出页面底部 (bottom={t+h}, max={sh})',
  268. 'L004', True, {'shape': shape, 'type': 'bottom'}))
  269. if l < SAFE_MARGIN and l >= 0:
  270. if l == 0 and w >= sw * 0.8:
  271. continue
  272. if int(shape.top) < 0 or int(shape.top) + int(shape.height) < Emu(100000):
  273. continue
  274. if int(shape.top) > sh - Emu(500000):
  275. continue
  276. issues.append(QualityIssue('minor', 'layout', page_idx,
  277. f'形状"{_shape_name(shape)}"过于靠近左边缘',
  278. 'L007', True, {'shape': shape, 'type': 'edge_left'}))
  279. placeholder_pattern = re.compile(r'\{[^}]+\}')
  280. for shape in slide.shapes:
  281. if shape.has_text_frame:
  282. text = shape.text_frame.text
  283. if placeholder_pattern.search(text):
  284. issues.append(QualityIssue('critical', 'layout', page_idx,
  285. f'发现未替换占位符: "{text[:50]}"',
  286. 'L006', True, {'shape': shape, 'type': 'placeholder'}))
  287. empty_artifacts = self._find_empty_template_artifacts(slide)
  288. for shape in empty_artifacts:
  289. issues.append(QualityIssue(
  290. 'major', 'layout', page_idx,
  291. f'发现空模板组件残留: "{_shape_name(shape)}"',
  292. 'L008', True, {'shape': shape, 'type': 'empty_template_artifact'}
  293. ))
  294. shapes_list = list(slide.shapes)
  295. for i, a in enumerate(shapes_list):
  296. for b in shapes_list[i+1:]:
  297. if self._shapes_overlap(a, b):
  298. a_name = _shape_name(a)
  299. b_name = _shape_name(b)
  300. if self._is_intentional_overlap(a, b):
  301. continue
  302. issues.append(QualityIssue('major', 'layout', page_idx,
  303. f'形状"{a_name}"与"{b_name}"存在重叠',
  304. 'L005', True, {'shape_a': a, 'shape_b': b, 'type': 'overlap'}))
  305. return issues
  306. def _check_visual(self, slide, page_idx) -> list[QualityIssue]:
  307. issues = []
  308. fonts_seen = {}
  309. for shape in slide.shapes:
  310. if not shape.has_text_frame:
  311. continue
  312. for para in shape.text_frame.paragraphs:
  313. for run in para.runs:
  314. if run.font.size:
  315. size_pt = run.font.size / 12700.0
  316. if size_pt < 6:
  317. issues.append(QualityIssue('major', 'visual', page_idx,
  318. f'字号过小 ({size_pt:.1f}pt): "{run.text[:20]}"',
  319. 'V002', True, {'run': run, 'type': 'font_small'}))
  320. elif size_pt > 65:
  321. issues.append(QualityIssue('major', 'visual', page_idx,
  322. f'字号过大 ({size_pt:.1f}pt): "{run.text[:20]}"',
  323. 'V003', True, {'run': run, 'type': 'font_large'}))
  324. if run.font.name:
  325. fonts_seen[run.font.name] = fonts_seen.get(run.font.name, 0) + 1
  326. if len(fonts_seen) > 3:
  327. issues.append(QualityIssue('minor', 'visual', page_idx,
  328. f'字体使用超过3种: {list(fonts_seen.keys())}',
  329. 'V001', True, {'type': 'font_mixed', 'fonts': fonts_seen}))
  330. return issues
    def _check_content(self, slide, page_idx, config, prs, page_type='content') -> list[QualityIssue]:
        """Collect content issues for one slide.

        Cover and end pages are only checked for text overflow. TOC pages are
        only checked for a minimum amount of text, after the dynamic-page and
        core-metric checks. All other pages are checked for fill ratio, text
        volume, paragraph length, number of insight paragraphs, a title, text
        overflow, and commentary accompanying charts.
        """
        issues = []
        if page_type in ('cover', 'end'):
            issues += self._check_text_overflow(slide, page_idx)
            return issues
        issues += self._check_dynamic_page_fit(page_idx, page_type, config)
        issues += self._check_core_metric_presence(slide, page_idx, page_type, config)
        if page_type == 'toc':
            content_shapes = [s for s in slide.shapes
                              if s.has_text_frame and _is_in_content_area(s)]
            all_content_text = ''
            for shape in content_shapes:
                text = shape.text_frame.text.strip()
                if text:
                    all_content_text += text + '\n'
            # Fewer than 30 characters of content-area text counts as too little.
            if len(all_content_text.strip()) < 30:
                issues.append(QualityIssue('minor', 'content', page_idx,
                                           '目录页内容过少',
                                           'C008', False, {'type': 'empty_page'}))
            return issues
        fill_ratio = calculate_fill_ratio(slide)
        # Analysis and forecast pages use the 'sparse'/'low' thresholds.
        if page_type in ('kpi_overview', 'trend', 'distribution', 'ranking', 'summary') or page_type in FORECAST_PAGE_TYPES:
            if fill_ratio < FILL_RATIO_THRESHOLDS['sparse']:
                issues.append(QualityIssue('critical', 'content', page_idx,
                                           f'页面内容严重不足,填充率仅 {fill_ratio:.1%},必须补充图表和分析文本',
                                           'C001', True, {'type': 'sparse', 'fill_ratio': fill_ratio}))
            elif fill_ratio < FILL_RATIO_THRESHOLDS['low']:
                issues.append(QualityIssue('major', 'content', page_idx,
                                           f'页面留白偏多,填充率 {fill_ratio:.1%},需补充分析内容',
                                           'C001', True, {'type': 'sparse', 'fill_ratio': fill_ratio}))
        # NOTE(review): indentation was lost in this listing. This branch is
        # reconstructed as the alternative for all other page types; nested
        # under the chain above it could never be reached. Confirm against the
        # original file.
        elif fill_ratio < FILL_RATIO_THRESHOLDS['sparse'] / 2:
            issues.append(QualityIssue('minor', 'content', page_idx,
                                       f'页面填充率过低 {fill_ratio:.1%}',
                                       'C001', False))
        content_shapes = [s for s in slide.shapes
                          if s.has_text_frame and _is_in_content_area(s)]
        all_content_text = ''
        # Counts paragraphs at least TEXT_MIN_LENGTH characters long.
        insight_blocks = 0
        for shape in content_shapes:
            tf = shape.text_frame
            full_text = tf.text.strip()
            if not full_text:
                continue
            all_content_text += full_text + '\n'
            for para in tf.paragraphs:
                para_text = para.text.strip()
                if para_text and len(para_text) >= TEXT_MIN_LENGTH:
                    insight_blocks += 1
        total_content_chars = len(all_content_text.strip())
        # Lengths of every non-blank paragraph in the content area.
        text_lengths = [len(p.text.strip()) for s in content_shapes
                        for p in s.text_frame.paragraphs if p.text.strip()]
        if total_content_chars < PAGE_MIN_TEXT_LENGTH:
            issues.append(QualityIssue('critical', 'content', page_idx,
                                       f'页面内容为空!所有文本框总字数仅 {total_content_chars} 字(要求≥{PAGE_MIN_TEXT_LENGTH}字)',
                                       'C008', True, {'type': 'empty_page', 'char_count': total_content_chars}))
        elif total_content_chars < 200:
            issues.append(QualityIssue('major', 'content', page_idx,
                                       f'页面内容过少,总字数仅 {total_content_chars} 字,分析深度严重不足',
                                       'C008', True, {'type': 'empty_page', 'char_count': total_content_chars}))
        # Every paragraph is shorter than the minimum analysis length.
        if text_lengths and max(text_lengths) < TEXT_MIN_LENGTH:
            issues.append(QualityIssue('critical', 'content', page_idx,
                                       f'分析文本过短(最长为 {max(text_lengths)} 字),需撰写≥{TEXT_MIN_LENGTH}字的深度分析',
                                       'C005', True, {'type': 'short_text', 'max_length': max(text_lengths)}))
        if insight_blocks < INSIGHT_MIN_COUNT:
            issues.append(QualityIssue('critical', 'content', page_idx,
                                       f'分析段数不足,仅 {insight_blocks} 段(要求≥{INSIGHT_MIN_COUNT}段)',
                                       'C007', True, {'type': 'insight_count', 'count': insight_blocks}))
        # Title detection: accept either any non-placeholder text inside the
        # vertical title band, or keyword-bearing text slightly below it.
        has_title = False
        for shape in slide.shapes:
            if shape.has_text_frame:
                text = shape.text_frame.text
                try:
                    sy = int(shape.top)
                except Exception:
                    # An unreadable position is pushed far below the title band.
                    sy = 99999999
                if sy < CONTENT_TOP_BASE + Emu(100000) and sy > Emu(500000):
                    if len(text.strip()) > 0 and not text.startswith('{'):
                        has_title = True
                        break
                if any(kw in text for kw in ['概览', '趋势', '分布', '分析', '总结',
                                             '排行', '报告', '建议', '告警', '要点']):
                    if sy < CONTENT_TOP_BASE + Emu(400000):
                        has_title = True
                        break
        # The first and last slides are never required to have a title.
        if not has_title and page_idx > 0 and page_idx < len(prs.slides) - 1:
            issues.append(QualityIssue('critical', 'content', page_idx,
                                       '页面缺少标题', 'C006', True, {'type': 'missing_title'}))
        issues += self._check_text_overflow(slide, page_idx)
        has_chart = False
        for shape in slide.shapes:
            if shape.has_chart:
                has_chart = True
                break
        # A chart with no qualifying paragraph is flagged from the third slide on.
        if has_chart and insight_blocks == 0 and page_idx >= 2:
            issues.append(QualityIssue('critical', 'content', page_idx,
                                       '页面有图表但完全缺少分析文本,图表数据需要被解读和说明',
                                       'C009', True, {'type': 'chart_no_text'}))
        return issues
  429. def _check_text_overflow(self, slide, page_idx) -> list[QualityIssue]:
  430. issues = []
  431. for shape in slide.shapes:
  432. if shape.has_text_frame and self._is_text_overflowing(shape):
  433. issues.append(QualityIssue(
  434. 'major', 'content', page_idx,
  435. f'文本可能超出文本框边界: "{shape.text_frame.text[:30]}"',
  436. 'C004', True, {'shape': shape, 'type': 'text_overflow'}
  437. ))
  438. return issues
  439. def _check_dynamic_page_fit(self, page_idx, page_type, config) -> list[QualityIssue]:
  440. issues = []
  441. profile = getattr(config, 'data_profiling', None) or {}
  442. if not profile:
  443. return issues
  444. time_cols = profile.get('time_columns', [])
  445. cat_cols = profile.get('category_columns', [])
  446. num_cols = profile.get('numeric_columns', [])
  447. if page_type == 'trend' and (not time_cols or not num_cols):
  448. issues.append(QualityIssue(
  449. 'critical', 'content', page_idx,
  450. '趋势页缺少可用时间列或数值列,需要重建或降级为摘要页',
  451. 'C010', True, {'type': 'dynamic_page_not_supported', 'page_type': page_type}
  452. ))
  453. elif page_type in ('distribution', 'ranking') and (not cat_cols or not num_cols):
  454. issues.append(QualityIssue(
  455. 'critical', 'content', page_idx,
  456. f'{page_type} 页缺少分类维度或数值列,需要重建或降级为摘要页',
  457. 'C010', True, {'type': 'dynamic_page_not_supported', 'page_type': page_type}
  458. ))
  459. elif page_type == 'kpi_overview':
  460. selected_metrics = [m for m in getattr(config, 'metrics', []) if getattr(m, 'selected', True)]
  461. if len(selected_metrics) > 6:
  462. issues.append(QualityIssue(
  463. 'minor', 'content', page_idx,
  464. f'核心指标数量 {len(selected_metrics)} 超过 6 个,KPI页应切换为紧凑布局或拆分展示',
  465. 'C011', True, {'type': 'kpi_layout_over_capacity', 'count': len(selected_metrics)}
  466. ))
  467. elif len(selected_metrics) >= 4:
  468. issues.append(QualityIssue(
  469. 'minor', 'content', page_idx,
  470. f'核心指标数量 {len(selected_metrics)} 较多,建议使用紧凑布局以保留洞察区',
  471. 'C011', True, {'type': 'kpi_layout_compact_needed', 'count': len(selected_metrics)}
  472. ))
  473. return issues
  474. def _check_core_metric_presence(self, slide, page_idx, page_type, config) -> list[QualityIssue]:
  475. issues = []
  476. if page_type != 'kpi_overview' or not config:
  477. return issues
  478. selected_metrics = [m for m in getattr(config, 'metrics', []) if getattr(m, 'selected', True)]
  479. if not selected_metrics:
  480. return issues
  481. slide_text = '\n'.join(
  482. shape.text_frame.text for shape in slide.shapes
  483. if shape.has_text_frame and shape.text_frame.text
  484. )
  485. missing = [m.label for m in selected_metrics[:6] if m.label and m.label not in slide_text]
  486. if missing:
  487. issues.append(QualityIssue(
  488. 'critical', 'data', page_idx,
  489. 'KPI概览页缺少已确认核心指标:' + '、'.join(missing),
  490. 'D006', True, {'type': 'core_metric_missing', 'missing': missing}
  491. ))
  492. return issues
  493. def _check_data(self, slide, page_idx, prs) -> list[QualityIssue]:
  494. issues = []
  495. if page_idx == 0:
  496. return issues
  497. for shape in slide.shapes:
  498. if shape.has_text_frame:
  499. text = shape.text_frame.text
  500. page_pattern = re.search(r'(\d+)\s*/\s*(\d+)', text)
  501. if page_pattern:
  502. current = int(page_pattern.group(1))
  503. total = int(page_pattern.group(2))
  504. if total == 0:
  505. issues.append(QualityIssue('major', 'data', page_idx,
  506. f'页码格式异常: {text.strip()}',
  507. 'D002', True, {'type': 'page_num'}))
  508. return issues
  509. def _fix_layout(self, slide, issue):
  510. fd = issue.fix_data
  511. if fd.get('type') in ('left', 'right', 'top', 'bottom'):
  512. shape = fd.get('shape')
  513. if shape:
  514. ensure_safe_position(shape, SLIDE_WIDTH, SLIDE_HEIGHT)
  515. fd['fixed'] = True
  516. elif fd.get('type') == 'overlap':
  517. a, b = fd.get('shape_a'), fd.get('shape_b')
  518. if a and b:
  519. try:
  520. if int(b.left) < int(a.left) + int(a.width) + Emu(50000):
  521. b.left = int(a.left) + int(a.width) + Emu(152400)
  522. ensure_safe_position(b, SLIDE_WIDTH, SLIDE_HEIGHT)
  523. except Exception:
  524. pass
  525. fd['fixed'] = True
  526. elif fd.get('type') == 'placeholder':
  527. shape = fd.get('shape')
  528. if shape and shape.has_text_frame:
  529. for para in shape.text_frame.paragraphs:
  530. para.text = re.sub(r'\{[^}]+\}', '', para.text)
  531. fd['fixed'] = True
  532. elif fd.get('type') == 'edge_left':
  533. shape = fd.get('shape')
  534. if shape:
  535. try:
  536. w = int(shape.width)
  537. if w < SLIDE_WIDTH * 0.5:
  538. shape.left = SAFE_MARGIN
  539. except Exception:
  540. pass
  541. fd['fixed'] = True
  542. elif fd.get('type') == 'empty_template_artifact':
  543. shape = fd.get('shape')
  544. if shape:
  545. self._remove_shape(shape)
  546. fd['fixed'] = True
  547. def _fix_visual(self, slide, issue):
  548. fd = issue.fix_data
  549. if fd.get('type') == 'font_small':
  550. run = fd.get('run')
  551. if run:
  552. run.font.size = FONT_SIZE_MIN
  553. fd['fixed'] = True
  554. elif fd.get('type') == 'font_large':
  555. run = fd.get('run')
  556. if run:
  557. run.font.size = FONT_SIZE_MAX
  558. fd['fixed'] = True
  559. elif fd.get('type') == 'font_mixed':
  560. for shape in slide.shapes:
  561. if shape.has_text_frame:
  562. for para in shape.text_frame.paragraphs:
  563. for run in para.runs:
  564. run.font.name = DEFAULT_FONT
  565. fd['fixed'] = True
  566. def _fix_content(self, slide, issue, prs):
  567. fd = issue.fix_data
  568. if fd.get('type') == 'sparse':
  569. fill_ratio = fd.get('fill_ratio', 0)
  570. if fill_ratio < FILL_RATIO_THRESHOLDS['low']:
  571. try:
  572. box = slide.shapes.add_textbox(
  573. CONTENT_LEFT, Emu(int(FOOTER_TOP) - Emu(1600000)),
  574. Emu(SLIDE_WIDTH - 2 * CONTENT_LEFT - Emu(200000)), Emu(1500000))
  575. tf = box.text_frame
  576. tf.word_wrap = True
  577. p = tf.paragraphs[0]
  578. p.text = (
  579. '[WARNING] 此页面内容不足,需补充深度分析内容。'
  580. '分析应包含:具体数据引用(含数值和单位)、'
  581. '与同类/历史/目标的对比分析、'
  582. '数据背后原因的至少2条解读、'
  583. '以及可执行的业务行动建议。'
  584. '请勿使用"要加强"、"进一步优化"等模糊措辞。'
  585. )
  586. p.font.size = Pt(12)
  587. p.font.color.rgb = RGBColor(0xCC, 0x33, 0x00)
  588. p.font.name = DEFAULT_FONT
  589. p.font.bold = True
  590. fd['fixed'] = True
  591. fd['needs_rebuild'] = True
  592. except Exception:
  593. pass
  594. elif fd.get('type') == 'empty_page':
  595. fd['needs_rebuild'] = True
  596. fd['fixed'] = True
  597. elif fd.get('type') == 'chart_no_text':
  598. fd['needs_rebuild'] = True
  599. fd['fixed'] = True
  600. elif fd.get('type') == 'insight_count':
  601. fd['needs_rebuild'] = True
  602. fd['fixed'] = True
  603. elif fd.get('type') == 'short_text':
  604. fd['needs_rebuild'] = True
  605. fd['fixed'] = True
  606. elif fd.get('type') in ('dynamic_page_not_supported', 'kpi_layout_over_capacity'):
  607. fd['fixed'] = True
  608. elif fd.get('type') == 'kpi_layout_compact_needed':
  609. fd['fixed'] = True
  610. elif fd.get('type') == 'core_metric_missing':
  611. fd['needs_rebuild'] = True
  612. fd['fixed'] = True
  613. elif fd.get('type') == 'missing_title':
  614. try:
  615. box = slide.shapes.add_textbox(
  616. CONTENT_LEFT, Emu(914400),
  617. Emu(SLIDE_WIDTH - 2 * CONTENT_LEFT - Emu(200000)), Emu(508000))
  618. p = box.text_frame.paragraphs[0]
  619. p.text = '数据详情'
  620. p.font.size = Pt(24)
  621. p.font.bold = True
  622. p.font.color.rgb = RGBColor(0x33, 0x33, 0x33)
  623. p.font.name = DEFAULT_FONT
  624. fd['fixed'] = True
  625. except Exception:
  626. pass
  627. elif fd.get('type') == 'text_overflow':
  628. shape = fd.get('shape')
  629. if shape and shape.has_text_frame:
  630. text_len = len(shape.text_frame.text or '')
  631. try:
  632. if text_len > 180 or int(shape.top) + int(shape.height) > int(FOOTER_TOP) - Emu(120000):
  633. fd['needs_rebuild'] = True
  634. else:
  635. for para in shape.text_frame.paragraphs:
  636. for run in para.runs:
  637. if run.font.size and run.font.size > Pt(9):
  638. run.font.size = Pt(9)
  639. except Exception:
  640. fd['needs_rebuild'] = True
  641. fd['fixed'] = True
  642. def _fix_data(self, slide, issue, prs):
  643. fd = issue.fix_data
  644. if fd.get('type') == 'page_num':
  645. fd['fixed'] = True
  646. def _shapes_overlap(self, a, b) -> bool:
  647. ax, ay, aw, ah = int(a.left), int(a.top), int(a.width), int(a.height)
  648. bx, by, bw, bh = int(b.left), int(b.top), int(b.width), int(b.height)
  649. if ax + aw <= bx or bx + bw <= ax:
  650. return False
  651. if ay + ah <= by or by + bh <= ay:
  652. return False
  653. return True
  654. def _is_intentional_overlap(self, a, b) -> bool:
  655. if hasattr(a, 'is_placeholder') or hasattr(b, 'is_placeholder'):
  656. return True
  657. a_area = int(a.width) * int(a.height)
  658. b_area = int(b.width) * int(b.height)
  659. if a_area > b_area * 3 or b_area > a_area * 3:
  660. return True
  661. return False
  662. def _is_title_shape(self, shape) -> bool:
  663. if not shape.has_text_frame:
  664. return False
  665. try:
  666. y = int(shape.top)
  667. return y < int(CONTENT_TOP_BASE) + Emu(200000)
  668. except Exception:
  669. return False
  670. def _find_empty_template_artifacts(self, slide) -> list:
  671. artifacts = []
  672. shapes = list(slide.shapes)
  673. empty_text_boxes = []
  674. for shape in shapes:
  675. if shape.has_text_frame:
  676. text = (shape.text_frame.text or '').strip()
  677. if text:
  678. continue
  679. if int(shape.width) < Emu(200000) or int(shape.height) < Emu(120000):
  680. continue
  681. if int(shape.top) < Emu(900000) or int(shape.top) > int(FOOTER_TOP) - Emu(100000):
  682. continue
  683. empty_text_boxes.append(shape)
  684. artifacts.append(shape)
  685. for shape in shapes:
  686. if shape.has_text_frame:
  687. continue
  688. try:
  689. is_large_soft_card = (
  690. int(shape.width) >= Emu(1000000) and
  691. int(shape.height) >= Emu(500000) and
  692. int(shape.top) < int(FOOTER_TOP) - Emu(400000)
  693. )
  694. if not is_large_soft_card:
  695. continue
  696. overlaps_empty_text = any(self._shapes_overlap(shape, box) for box in empty_text_boxes)
  697. if overlaps_empty_text:
  698. artifacts.append(shape)
  699. except Exception:
  700. continue
  701. # Preserve order while de-duplicating.
  702. seen = set()
  703. unique = []
  704. for shape in artifacts:
  705. key = id(shape)
  706. if key not in seen:
  707. unique.append(shape)
  708. seen.add(key)
  709. return unique
  710. def _remove_shape(self, shape):
  711. el = shape.element
  712. el.getparent().remove(el)
  713. def _is_text_overflowing(self, shape) -> bool:
  714. if not shape.has_text_frame:
  715. return False
  716. text = shape.text_frame.text
  717. if not text.strip():
  718. return False
  719. if len(text) > 800:
  720. return True
  721. try:
  722. w = int(shape.width)
  723. h = int(shape.height)
  724. width_pt = max(1, w / 12700.0)
  725. max_font_pt = 10
  726. para_count = 0
  727. for para in shape.text_frame.paragraphs:
  728. if not para.text.strip():
  729. continue
  730. para_count += 1
  731. for run in para.runs:
  732. if run.font.size:
  733. max_font_pt = max(max_font_pt, run.font.size / 12700.0)
  734. chars_per_line = max(8, int(width_pt / (max_font_pt * 1.15)))
  735. est_lines = max(1, (len(text) + chars_per_line - 1) // chars_per_line)
  736. est_height = int((est_lines * max_font_pt * 1.2 + para_count * 4) * 12700)
  737. if est_height > h * 1.15:
  738. return True
  739. if h < Emu(200000) and len(text) > 80:
  740. return True
  741. except Exception:
  742. pass
  743. return False
  744. def _shape_name(shape):
  745. try:
  746. if shape.has_text_frame:
  747. return shape.text_frame.text[:20].replace('\n', ' ')
  748. except Exception:
  749. pass
  750. try:
  751. return shape.shape_type
  752. except Exception:
  753. pass
  754. return '无名形状'
  755. def _is_in_content_area(shape):
  756. try:
  757. return int(shape.top) >= int(CONTENT_TOP_BASE)
  758. except Exception:
  759. return False
# Smoke check when run as a script: constructs an inspector and prints two
# status lines; no presentation is opened or inspected.
if __name__ == '__main__':
    print("QualityInspector module loaded")
    inspector = QualityInspector()
    print("Ready to inspect PPT files")