agent_analyzer.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. """
  2. Agent analyzer: intelligent analysis of data profile to generate
  3. recommendations for metrics, pages, charts, and overall report structure.
  4. Uses rule-based heuristics for analysis and generates structured recommendations.
  5. """
  6. from report_config import (
  7. MetricDef, PageDef, MetricType, AggregationType, ChartType,
  8. PeriodType, ColumnRole
  9. )
  10. def analyze_and_recommend(profile: dict, period_type: PeriodType = PeriodType.MONTHLY) -> dict:
  11. recommendations = {
  12. 'suggested_metrics': _recommend_metrics(profile),
  13. 'suggested_pages': _recommend_pages(profile, period_type),
  14. 'suggested_period': period_type.value,
  15. 'suggested_page_range': (6, 15),
  16. 'data_summary': _build_summary(profile),
  17. 'chart_mapping': _build_chart_mapping(profile),
  18. 'analysis_notes': _build_analysis_notes(profile),
  19. 'analyst_requirements': _build_analyst_requirements(profile, period_type),
  20. }
  21. recommendations.update(_suggest_period_and_range(profile))
  22. return recommendations
  23. def _build_analyst_requirements(profile: dict, period_type: PeriodType) -> dict:
  24. """Requirements for decision-grade analysis narratives."""
  25. num_cols = profile.get('numeric_columns', [])
  26. cat_cols = profile.get('category_columns', [])
  27. time_cols = profile.get('time_columns', [])
  28. requirements = [
  29. '每个分析页至少包含业务判断、数据证据、对比关系、原因假设、风险/机会、行动建议中的三项。',
  30. '从第4页开始禁止仅复述图表或排行,必须输出诊断、归因、影响和下一步动作。',
  31. '长分类列表必须压缩为Top N + 其余汇总,不能塞入KPI值或正文长段落。',
  32. '若缺少目标/历史/同比数据,需明确说明当前为基线视图,并提出下一步应补充的对比字段。',
  33. ]
  34. if time_cols:
  35. requirements.append('趋势页必须识别峰值、低谷、拐点、阶段变化或波动区间。')
  36. if cat_cols:
  37. requirements.append('分布/排行页必须分析Top1/Top3贡献、头部集中度、尾部结构和资源配置含义。')
  38. if len(num_cols) >= 2:
  39. requirements.append('KPI页需区分结果指标与过程指标,分析指标之间是否一致或存在背离。')
  40. if period_type == PeriodType.MONTHLY:
  41. requirements.append('月报必须包含月度经营判断、关键驱动/拖累、风险预警和下月行动计划。')
  42. return {
  43. 'role': 'professional_data_analyst',
  44. 'reference': 'references/professional-data-analyst-playbook.md',
  45. 'minimum_requirements': requirements,
  46. 'keywords': [
  47. '环比', '同比', '达成率', '缺口', '贡献率', '集中度', '长尾',
  48. '拐点', '波动率', '结构失衡', '转化效率', '阶段阻塞',
  49. '风险敞口', '关键假设', '情景分析', '预警阈值', '闭环机制',
  50. ],
  51. }
  52. def _recommend_metrics(profile: dict) -> list[dict]:
  53. metrics = []
  54. num_cols = profile.get('numeric_columns', [])
  55. cat_cols = profile.get('category_columns', [])
  56. for i, col in enumerate(num_cols):
  57. ns = col.get('numeric_stats', {}) or {}
  58. label = col.get('inferred_label', col['column_name'])
  59. unit = _infer_unit(col['column_name'])
  60. is_primary = i < 4
  61. metrics.append({
  62. 'name': f"{label}_{col['column_name']}",
  63. 'label': label,
  64. 'column': col['column_name'],
  65. 'aggregation': 'sum',
  66. 'metric_type': 'kpi',
  67. 'unit': unit,
  68. 'selected': is_primary,
  69. 'is_primary': is_primary,
  70. 'sample_value': ns.get('sum', 0),
  71. })
  72. if len(num_cols) <= 4 and ns.get('sum', 0) > 100:
  73. metrics.append({
  74. 'name': f"日均{label}",
  75. 'label': f"日均{label}",
  76. 'column': col['column_name'],
  77. 'aggregation': 'avg',
  78. 'metric_type': 'kpi',
  79. 'unit': unit,
  80. 'selected': False,
  81. 'is_primary': False,
  82. 'sample_value': ns.get('mean', 0),
  83. })
  84. if cat_cols:
  85. top_cat = cat_cols[0]
  86. metrics.append({
  87. 'name': f"覆盖{top_cat['inferred_label']}数",
  88. 'label': f"覆盖{top_cat['inferred_label']}数",
  89. 'column': top_cat['column_name'],
  90. 'aggregation': 'distinct_count',
  91. 'metric_type': 'kpi',
  92. 'unit': '个',
  93. 'selected': True,
  94. 'is_primary': False,
  95. 'sample_value': top_cat.get('unique_count', 0),
  96. })
  97. return metrics
  98. def _recommend_pages(profile: dict, period_type: PeriodType) -> list[dict]:
  99. pages = []
  100. order = 0
  101. pages.append({
  102. 'page_id': 'cover',
  103. 'title': '封面',
  104. 'page_type': 'cover',
  105. 'order': order,
  106. 'selected': True,
  107. 'elements': [],
  108. 'conclusion_title': '',
  109. })
  110. order += 1
  111. num_cols = profile.get('numeric_columns', [])
  112. if period_type in (PeriodType.MONTHLY, PeriodType.QUARTERLY):
  113. pages.append({
  114. 'page_id': 'toc',
  115. 'title': '目录',
  116. 'page_type': 'toc',
  117. 'order': order,
  118. 'selected': True,
  119. 'elements': [],
  120. })
  121. order += 1
  122. pages.append({
  123. 'page_id': 'kpi_overview',
  124. 'title': '核心指标概览',
  125. 'page_type': 'kpi_overview',
  126. 'order': order,
  127. 'selected': True,
  128. 'elements': [{'type': 'kpi_cards', 'count': min(6, len(num_cols))}],
  129. 'conclusion_title': '核心指标概览',
  130. })
  131. order += 1
  132. time_cols = profile.get('time_columns', [])
  133. if time_cols and num_cols:
  134. top_num = num_cols[0]
  135. pages.append({
  136. 'page_id': 'trend',
  137. 'title': f'{top_num["inferred_label"]}趋势',
  138. 'page_type': 'trend',
  139. 'order': order,
  140. 'selected': True,
  141. 'elements': [
  142. {'type': 'line_chart', 'metric': top_num['column_name'],
  143. 'dimension': time_cols[0]['column_name'], 'title': f'{top_num["inferred_label"]}趋势'}
  144. ],
  145. 'conclusion_title': f'{top_num["inferred_label"]}趋势',
  146. })
  147. order += 1
  148. if period_type == PeriodType.MONTHLY:
  149. forecast_cols = [
  150. c for c in num_cols
  151. if any(k in (c.get('column_name', '') + c.get('inferred_label', '')).lower()
  152. for k in ('预测', 'forecast', '目标', 'target', '计划', 'plan'))
  153. ]
  154. if forecast_cols:
  155. pages.append({
  156. 'page_id': 'monthly_forecast',
  157. 'title': '下月预测与行动计划',
  158. 'page_type': 'monthly_forecast',
  159. 'order': order,
  160. 'selected': True,
  161. 'elements': [{
  162. 'type': 'forecast_chart',
  163. 'metrics': [c['column_name'] for c in forecast_cols[:3]],
  164. 'title': '下月预测与行动计划',
  165. }],
  166. 'conclusion_title': '下月预测与行动计划',
  167. })
  168. order += 1
  169. cat_cols = profile.get('category_columns', [])
  170. if cat_cols and num_cols:
  171. top_cat = cat_cols[0]
  172. top_num = num_cols[0]
  173. pages.append({
  174. 'page_id': 'distribution',
  175. 'title': f'{top_cat["inferred_label"]}分布',
  176. 'page_type': 'distribution',
  177. 'order': order,
  178. 'selected': True,
  179. 'elements': [
  180. {'type': 'doughnut_chart', 'metric': top_num['column_name'],
  181. 'dimension': top_cat['column_name'], 'title': f'{top_cat["inferred_label"]}占比'}
  182. ],
  183. 'conclusion_title': f'{top_cat["inferred_label"]}分布',
  184. })
  185. order += 1
  186. if len(cat_cols) >= 2:
  187. cat2 = cat_cols[1] if len(cat_cols) > 1 else cat_cols[0]
  188. pages.append({
  189. 'page_id': 'ranking',
  190. 'title': f'{cat2["inferred_label"]}排行',
  191. 'page_type': 'ranking',
  192. 'order': order,
  193. 'selected': True,
  194. 'elements': [
  195. {'type': 'bar_chart', 'metric': num_cols[0]['column_name'],
  196. 'dimension': cat2['column_name'], 'title': f'{cat2["inferred_label"]}TOP排行'}
  197. ],
  198. 'conclusion_title': f'{cat2["inferred_label"]}TOP排行',
  199. })
  200. order += 1
  201. pages.append({
  202. 'page_id': 'summary',
  203. 'title': '总结与建议',
  204. 'page_type': 'summary',
  205. 'order': order,
  206. 'selected': True,
  207. 'elements': [{'type': 'insight_block', 'title': '总结与建议'}],
  208. 'conclusion_title': '总结与建议',
  209. })
  210. order += 1
  211. pages.append({
  212. 'page_id': 'end',
  213. 'title': '尾页',
  214. 'page_type': 'end',
  215. 'order': order,
  216. 'selected': True,
  217. 'elements': [],
  218. })
  219. return pages
  220. def _suggest_period_and_range(profile: dict) -> dict:
  221. granularity = profile.get('time_granularity', 'monthly')
  222. dr = profile.get('date_range', (None, None))
  223. period_map = {
  224. 'daily': PeriodType.DAILY,
  225. 'weekly': PeriodType.WEEKLY,
  226. 'monthly': PeriodType.MONTHLY,
  227. 'quarterly': PeriodType.QUARTERLY,
  228. 'yearly': PeriodType.MONTHLY,
  229. }
  230. suggested = period_map.get(granularity, PeriodType.MONTHLY)
  231. page_range_map = {
  232. 'daily': (6, 9),
  233. 'weekly': (7, 11),
  234. 'monthly': (8, 14),
  235. 'quarterly': (10, 18),
  236. 'yearly': (12, 20),
  237. }
  238. page_range = page_range_map.get(granularity, (8, 14))
  239. return {
  240. 'suggested_period': suggested.value,
  241. 'suggested_page_range': page_range,
  242. }
  243. def _build_chart_mapping(profile: dict) -> list[dict]:
  244. mapping = []
  245. num_cols = profile.get('numeric_columns', [])
  246. time_cols = profile.get('time_columns', [])
  247. cat_cols = profile.get('category_columns', [])
  248. if time_cols and num_cols:
  249. for nc in num_cols[:3]:
  250. mapping.append({
  251. 'metric': nc['inferred_label'],
  252. 'metric_col': nc['column_name'],
  253. 'dimension': time_cols[0]['column_name'],
  254. 'dimension_label': '时间',
  255. 'chart_type': ChartType.LINE.value,
  256. 'rationale': f'{nc["inferred_label"]}随时间变化趋势',
  257. })
  258. if cat_cols and num_cols:
  259. top_num = num_cols[0]
  260. for cc in cat_cols[:3]:
  261. chart_type = ChartType.DOUGHNUT.value if cc['unique_count'] <= 8 else ChartType.BAR.value
  262. mapping.append({
  263. 'metric': top_num['inferred_label'],
  264. 'metric_col': top_num['column_name'],
  265. 'dimension': cc['column_name'],
  266. 'dimension_label': cc['inferred_label'],
  267. 'chart_type': chart_type,
  268. 'rationale': f'{top_num["inferred_label"]}按{cc["inferred_label"]}的分布',
  269. })
  270. return mapping
  271. def _build_summary(profile: dict) -> str:
  272. lines = []
  273. lines.append(f"数据量: {profile['total_rows']:,} 行 × {profile['total_columns']} 列")
  274. num_cols = profile.get('numeric_columns', [])
  275. cat_cols = profile.get('category_columns', [])
  276. time_cols = profile.get('time_columns', [])
  277. lines.append(f"可计算指标: {len(num_cols)} 个数值列")
  278. lines.append(f"可分析维度: {len(cat_cols)} 个分类列")
  279. if time_cols:
  280. lines.append(f"时间列: {time_cols[0]['column_name']}")
  281. lines.append(f"数据粒度: {profile.get('time_granularity', 'unknown')}")
  282. dr = profile.get('date_range', (None, None))
  283. if dr[0]:
  284. lines.append(f"时间范围: {dr[0]} ~ {dr[1]}")
  285. q = profile.get('data_quality', {})
  286. lines.append(f"质量评分: {q.get('score', 0)}/100")
  287. return '\n'.join(lines)
  288. def _build_analysis_notes(profile: dict) -> list[str]:
  289. notes = []
  290. num_cols = profile.get('numeric_columns', [])
  291. cat_cols = profile.get('category_columns', [])
  292. if not cat_cols:
  293. notes.append('数据中缺少分类维度列,报告将以数值汇总为主,建议补充分类字段以增强分析深度。')
  294. if len(num_cols) >= 4:
  295. names = [c['inferred_label'] for c in num_cols[:4]]
  296. notes.append(f'核心数值指标: {", ".join(names)}')
  297. if len(cat_cols) == 1:
  298. notes.append(f'仅有一个分类维度列 ({cat_cols[0]["inferred_label"]}),报告分析维度较窄。')
  299. elif len(cat_cols) >= 3:
  300. names = [c['inferred_label'] for c in cat_cols[:3]]
  301. notes.append(f'分类维度丰富 ({", ".join(names)}),可支撑多维交叉分析。')
  302. q = profile.get('data_quality', {})
  303. if q.get('score', 100) < 85:
  304. notes.append(f'数据质量评分偏低 ({q["score"]}/100),建议在生成前检查缺失值与异常值。')
  305. return notes
  306. def _infer_unit(col_name: str) -> str:
  307. col_lower = col_name.lower().strip()
  308. unit_map = {
  309. '金额': '元', '销售额': '元', '收入': '元', '利润': '元',
  310. '成本': '元', '费用': '元', '台数': '台', '件数': '件',
  311. '数量': '', '人数': '人', '天数': '天', '占比': '%',
  312. '比率': '%', '比例': '%', '率': '%',
  313. }
  314. for kw, unit in unit_map.items():
  315. if kw in col_lower:
  316. return unit
  317. return ''
  318. def generate_interaction_prompts(recommendations: dict, profile: dict) -> dict:
  319. return {
  320. 'period': {
  321. 'question': '报告周期与页数范围',
  322. 'detail': f"建议周期: {recommendations['suggested_period']}报\n建议页数: {recommendations['suggested_page_range'][0]}-{recommendations['suggested_page_range'][1]} 页\n请确认或调整",
  323. },
  324. 'metrics': {
  325. 'question': '核心指标集',
  326. 'detail': f"检测到 {len(recommendations['suggested_metrics'])} 个可计算指标\n已自动推荐主要的 {min(6, len(recommendations['suggested_metrics']))} 个\n请确认或增删",
  327. },
  328. 'audience': {
  329. 'question': '受众与决策场景',
  330. 'detail': '请选择: 管理层汇报 | 运营分析会 | 对外客户报告 | 自定义描述',
  331. },
  332. 'style': {
  333. 'question': '视觉风格与配色方向',
  334. 'detail': '推荐方案: 商务经典(深蓝) | 清新简约(绿色) | 深色专业 | 温暖品牌\n请选择配色方案',
  335. },
  336. 'pages': {
  337. 'question': '页面结构与模板方案',
  338. 'detail': f'推荐 {len(recommendations["suggested_pages"])} 个页面\n可增删调整页面顺序',
  339. },
  340. }
  341. if __name__ == '__main__':
  342. profile = {
  343. 'total_rows': 3240,
  344. 'total_columns': 15,
  345. 'numeric_columns': [
  346. {'column_name': '销售额', 'inferred_label': '销售额', 'numeric_stats': {'sum': 500000, 'mean': 154}},
  347. {'column_name': '订单量', 'inferred_label': '订单量', 'numeric_stats': {'sum': 3240, 'mean': 1.0}},
  348. {'column_name': '利润', 'inferred_label': '利润', 'numeric_stats': {'sum': 80000, 'mean': 25}},
  349. ],
  350. 'category_columns': [
  351. {'column_name': '区域', 'inferred_label': '区域', 'unique_count': 5},
  352. {'column_name': '产品', 'inferred_label': '产品', 'unique_count': 12},
  353. ],
  354. 'time_columns': [{'column_name': '日期', 'inferred_label': '日期'}],
  355. 'time_granularity': 'monthly',
  356. 'date_range': ('2026-01-01', '2026-04-30'),
  357. 'data_quality': {'score': 92},
  358. }
  359. recs = analyze_and_recommend(profile, PeriodType.MONTHLY)
  360. prompts = generate_interaction_prompts(recs, profile)
  361. for k, v in prompts.items():
  362. print(f"\n{k}: {v['question']}\n{v['detail']}")