| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- """
- Agent analyzer: intelligent analysis of data profile to generate
- recommendations for metrics, pages, charts, and overall report structure.
- Uses rule-based heuristics for analysis and generates structured recommendations.
- """
- from report_config import (
- MetricDef, PageDef, MetricType, AggregationType, ChartType,
- PeriodType, ColumnRole
- )
def analyze_and_recommend(profile: dict, period_type: PeriodType = PeriodType.MONTHLY) -> dict:
    """Assemble the complete recommendation bundle for a profiled dataset.

    Args:
        profile: Data profile dictionary; it is passed unchanged to each of
            the private builders in this module.
        period_type: Reporting period used for page and analyst-requirement
            recommendations.

    Returns:
        A dict with the keys ``suggested_metrics``, ``suggested_pages``,
        ``suggested_period``, ``suggested_page_range``, ``data_summary``,
        ``chart_mapping``, ``analysis_notes`` and ``analyst_requirements``.
    """
    bundle = {}
    bundle['suggested_metrics'] = _recommend_metrics(profile)
    bundle['suggested_pages'] = _recommend_pages(profile, period_type)
    # Provisional values: both keys are overwritten further down by the
    # suggestion derived from the profile itself.
    bundle['suggested_period'] = period_type.value
    bundle['suggested_page_range'] = (6, 15)
    bundle['data_summary'] = _build_summary(profile)
    bundle['chart_mapping'] = _build_chart_mapping(profile)
    bundle['analysis_notes'] = _build_analysis_notes(profile)
    bundle['analyst_requirements'] = _build_analyst_requirements(profile, period_type)

    for key, value in _suggest_period_and_range(profile).items():
        bundle[key] = value
    return bundle
def _build_analyst_requirements(profile: dict, period_type: PeriodType) -> dict:
    """Describe the narrative requirements attached to the recommendations.

    Four baseline requirements are always present. Further ones are appended,
    in a fixed order, when the profile has time columns, category columns,
    at least two numeric columns, and when the period is monthly.

    Args:
        profile: Data profile; only ``time_columns``, ``category_columns``
            and ``numeric_columns`` are read.
        period_type: Reporting period of the report being planned.

    Returns:
        A dict with ``role``, ``reference``, ``minimum_requirements`` and
        ``keywords``.
    """
    has_numeric_pair = len(profile.get('numeric_columns', [])) >= 2
    has_categories = bool(profile.get('category_columns', []))
    has_time = bool(profile.get('time_columns', []))

    baseline = [
        '每个分析页至少包含业务判断、数据证据、对比关系、原因假设、风险/机会、行动建议中的三项。',
        '从第4页开始禁止仅复述图表或排行,必须输出诊断、归因、影响和下一步动作。',
        '长分类列表必须压缩为Top N + 其余汇总,不能塞入KPI值或正文长段落。',
        '若缺少目标/历史/同比数据,需明确说明当前为基线视图,并提出下一步应补充的对比字段。',
    ]
    # (condition, requirement) pairs; list order is the output order.
    conditional = [
        (has_time, '趋势页必须识别峰值、低谷、拐点、阶段变化或波动区间。'),
        (has_categories, '分布/排行页必须分析Top1/Top3贡献、头部集中度、尾部结构和资源配置含义。'),
        (has_numeric_pair, 'KPI页需区分结果指标与过程指标,分析指标之间是否一致或存在背离。'),
        (period_type == PeriodType.MONTHLY, '月报必须包含月度经营判断、关键驱动/拖累、风险预警和下月行动计划。'),
    ]
    minimum = baseline + [text for applies, text in conditional if applies]

    return {
        'role': 'professional_data_analyst',
        'reference': 'references/professional-data-analyst-playbook.md',
        'minimum_requirements': minimum,
        'keywords': [
            '环比', '同比', '达成率', '缺口', '贡献率', '集中度', '长尾',
            '拐点', '波动率', '结构失衡', '转化效率', '阶段阻塞',
            '风险敞口', '关键假设', '情景分析', '预警阈值', '闭环机制',
        ],
    }
- def _recommend_metrics(profile: dict) -> list[dict]:
- metrics = []
- num_cols = profile.get('numeric_columns', [])
- cat_cols = profile.get('category_columns', [])
- for i, col in enumerate(num_cols):
- ns = col.get('numeric_stats', {}) or {}
- label = col.get('inferred_label', col['column_name'])
- unit = _infer_unit(col['column_name'])
- is_primary = i < 4
- metrics.append({
- 'name': f"{label}_{col['column_name']}",
- 'label': label,
- 'column': col['column_name'],
- 'aggregation': 'sum',
- 'metric_type': 'kpi',
- 'unit': unit,
- 'selected': is_primary,
- 'is_primary': is_primary,
- 'sample_value': ns.get('sum', 0),
- })
- if len(num_cols) <= 4 and ns.get('sum', 0) > 100:
- metrics.append({
- 'name': f"日均{label}",
- 'label': f"日均{label}",
- 'column': col['column_name'],
- 'aggregation': 'avg',
- 'metric_type': 'kpi',
- 'unit': unit,
- 'selected': False,
- 'is_primary': False,
- 'sample_value': ns.get('mean', 0),
- })
- if cat_cols:
- top_cat = cat_cols[0]
- metrics.append({
- 'name': f"覆盖{top_cat['inferred_label']}数",
- 'label': f"覆盖{top_cat['inferred_label']}数",
- 'column': top_cat['column_name'],
- 'aggregation': 'distinct_count',
- 'metric_type': 'kpi',
- 'unit': '个',
- 'selected': True,
- 'is_primary': False,
- 'sample_value': top_cat.get('unique_count', 0),
- })
- return metrics
def _recommend_pages(profile: dict, period_type: PeriodType) -> list[dict]:
    """Propose the ordered page structure of the report.

    Pages are emitted in this order, each only when its condition holds:
    cover (always), table of contents (monthly/quarterly), KPI overview
    (always), trend (time + numeric columns), monthly forecast (monthly
    period and a numeric column whose name or label contains a forecast /
    target / plan keyword), distribution (category + numeric columns),
    ranking (at least two category columns and a numeric column), summary
    (always) and end page (always).

    Args:
        profile: Data profile; reads ``numeric_columns``, ``time_columns``
            and ``category_columns``.
        period_type: Reporting period of the report being planned.

    Returns:
        A list of page dicts. Every page carries the same key set
        (``page_id``, ``title``, ``page_type``, ``order``, ``selected``,
        ``elements``, ``conclusion_title``); ``order`` is the zero-based
        position in the list.
    """
    pages = []

    def add_page(page_id, title, elements, conclusion_title=''):
        # Every page in this module uses its id as its type; `order` is the
        # insertion index, so it can never drift from the list position.
        pages.append({
            'page_id': page_id,
            'title': title,
            'page_type': page_id,
            'order': len(pages),
            'selected': True,
            'elements': elements,
            'conclusion_title': conclusion_title,
        })

    def label_of(col):
        # Fall back to the raw column name when no label was inferred.
        return col.get('inferred_label', col['column_name'])

    num_cols = profile.get('numeric_columns', [])
    time_cols = profile.get('time_columns', [])
    cat_cols = profile.get('category_columns', [])

    add_page('cover', '封面', [])

    if period_type in (PeriodType.MONTHLY, PeriodType.QUARTERLY):
        add_page('toc', '目录', [])

    add_page(
        'kpi_overview',
        '核心指标概览',
        [{'type': 'kpi_cards', 'count': min(6, len(num_cols))}],
        '核心指标概览',
    )

    if time_cols and num_cols:
        top_num = num_cols[0]
        trend_title = f'{label_of(top_num)}趋势'
        add_page(
            'trend',
            trend_title,
            [{
                'type': 'line_chart',
                'metric': top_num['column_name'],
                'dimension': time_cols[0]['column_name'],
                'title': trend_title,
            }],
            trend_title,
        )

    if period_type == PeriodType.MONTHLY:
        forecast_keywords = ('预测', 'forecast', '目标', 'target', '计划', 'plan')
        forecast_cols = [
            c for c in num_cols
            if any(
                k in (c.get('column_name', '') + c.get('inferred_label', '')).lower()
                for k in forecast_keywords
            )
        ]
        if forecast_cols:
            forecast_title = '下月预测与行动计划'
            add_page(
                'monthly_forecast',
                forecast_title,
                [{
                    'type': 'forecast_chart',
                    # At most three forecast-like columns are charted.
                    'metrics': [c['column_name'] for c in forecast_cols[:3]],
                    'title': forecast_title,
                }],
                forecast_title,
            )

    if cat_cols and num_cols:
        top_cat = cat_cols[0]
        top_num = num_cols[0]
        top_cat_label = label_of(top_cat)
        add_page(
            'distribution',
            f'{top_cat_label}分布',
            [{
                'type': 'doughnut_chart',
                'metric': top_num['column_name'],
                'dimension': top_cat['column_name'],
                'title': f'{top_cat_label}占比',
            }],
            f'{top_cat_label}分布',
        )

        # Ranking uses the second category column; it sits inside the
        # numeric-column guard because it charts num_cols[0].
        if len(cat_cols) >= 2:
            cat2 = cat_cols[1]
            cat2_label = label_of(cat2)
            add_page(
                'ranking',
                f'{cat2_label}排行',
                [{
                    'type': 'bar_chart',
                    'metric': top_num['column_name'],
                    'dimension': cat2['column_name'],
                    'title': f'{cat2_label}TOP排行',
                }],
                f'{cat2_label}TOP排行',
            )

    add_page('summary', '总结与建议', [{'type': 'insight_block', 'title': '总结与建议'}], '总结与建议')
    add_page('end', '尾页', [])
    return pages
def _suggest_period_and_range(profile: dict) -> dict:
    """Suggest a report period and page-count range from the time granularity.

    Args:
        profile: Data profile; only ``time_granularity`` is read and it
            defaults to ``'monthly'`` when absent.

    Returns:
        A dict with ``suggested_period`` (the ``.value`` of the chosen
        ``PeriodType``) and ``suggested_page_range`` (a ``(min, max)`` tuple).
        Unknown granularities fall back to monthly and ``(8, 14)``.
    """
    granularity = profile.get('time_granularity', 'monthly')

    period_map = {
        'daily': PeriodType.DAILY,
        'weekly': PeriodType.WEEKLY,
        'monthly': PeriodType.MONTHLY,
        'quarterly': PeriodType.QUARTERLY,
        # Yearly data is mapped to a monthly report here.
        'yearly': PeriodType.MONTHLY,
    }
    page_range_map = {
        'daily': (6, 9),
        'weekly': (7, 11),
        'monthly': (8, 14),
        'quarterly': (10, 18),
        'yearly': (12, 20),
    }

    suggested = period_map.get(granularity, PeriodType.MONTHLY)
    page_range = page_range_map.get(granularity, (8, 14))
    return {
        'suggested_period': suggested.value,
        'suggested_page_range': page_range,
    }
- def _build_chart_mapping(profile: dict) -> list[dict]:
- mapping = []
- num_cols = profile.get('numeric_columns', [])
- time_cols = profile.get('time_columns', [])
- cat_cols = profile.get('category_columns', [])
- if time_cols and num_cols:
- for nc in num_cols[:3]:
- mapping.append({
- 'metric': nc['inferred_label'],
- 'metric_col': nc['column_name'],
- 'dimension': time_cols[0]['column_name'],
- 'dimension_label': '时间',
- 'chart_type': ChartType.LINE.value,
- 'rationale': f'{nc["inferred_label"]}随时间变化趋势',
- })
- if cat_cols and num_cols:
- top_num = num_cols[0]
- for cc in cat_cols[:3]:
- chart_type = ChartType.DOUGHNUT.value if cc['unique_count'] <= 8 else ChartType.BAR.value
- mapping.append({
- 'metric': top_num['inferred_label'],
- 'metric_col': top_num['column_name'],
- 'dimension': cc['column_name'],
- 'dimension_label': cc['inferred_label'],
- 'chart_type': chart_type,
- 'rationale': f'{top_num["inferred_label"]}按{cc["inferred_label"]}的分布',
- })
- return mapping
- def _build_summary(profile: dict) -> str:
- lines = []
- lines.append(f"数据量: {profile['total_rows']:,} 行 × {profile['total_columns']} 列")
- num_cols = profile.get('numeric_columns', [])
- cat_cols = profile.get('category_columns', [])
- time_cols = profile.get('time_columns', [])
- lines.append(f"可计算指标: {len(num_cols)} 个数值列")
- lines.append(f"可分析维度: {len(cat_cols)} 个分类列")
- if time_cols:
- lines.append(f"时间列: {time_cols[0]['column_name']}")
- lines.append(f"数据粒度: {profile.get('time_granularity', 'unknown')}")
- dr = profile.get('date_range', (None, None))
- if dr[0]:
- lines.append(f"时间范围: {dr[0]} ~ {dr[1]}")
- q = profile.get('data_quality', {})
- lines.append(f"质量评分: {q.get('score', 0)}/100")
- return '\n'.join(lines)
- def _build_analysis_notes(profile: dict) -> list[str]:
- notes = []
- num_cols = profile.get('numeric_columns', [])
- cat_cols = profile.get('category_columns', [])
- if not cat_cols:
- notes.append('数据中缺少分类维度列,报告将以数值汇总为主,建议补充分类字段以增强分析深度。')
- if len(num_cols) >= 4:
- names = [c['inferred_label'] for c in num_cols[:4]]
- notes.append(f'核心数值指标: {", ".join(names)}')
- if len(cat_cols) == 1:
- notes.append(f'仅有一个分类维度列 ({cat_cols[0]["inferred_label"]}),报告分析维度较窄。')
- elif len(cat_cols) >= 3:
- names = [c['inferred_label'] for c in cat_cols[:3]]
- notes.append(f'分类维度丰富 ({", ".join(names)}),可支撑多维交叉分析。')
- q = profile.get('data_quality', {})
- if q.get('score', 100) < 85:
- notes.append(f'数据质量评分偏低 ({q["score"]}/100),建议在生成前检查缺失值与异常值。')
- return notes
- def _infer_unit(col_name: str) -> str:
- col_lower = col_name.lower().strip()
- unit_map = {
- '金额': '元', '销售额': '元', '收入': '元', '利润': '元',
- '成本': '元', '费用': '元', '台数': '台', '件数': '件',
- '数量': '', '人数': '人', '天数': '天', '占比': '%',
- '比率': '%', '比例': '%', '率': '%',
- }
- for kw, unit in unit_map.items():
- if kw in col_lower:
- return unit
- return ''
def generate_interaction_prompts(recommendations: dict, profile: dict) -> dict:
    """Build the confirmation prompts presented for each planning step.

    Args:
        recommendations: Recommendation bundle; reads ``suggested_period``,
            ``suggested_page_range``, ``suggested_metrics`` and
            ``suggested_pages``.
        profile: Data profile; accepted for interface compatibility and not
            read by this function.

    Returns:
        A dict keyed by ``period``, ``metrics``, ``audience``, ``style`` and
        ``pages``; each value holds a ``question`` and a ``detail`` string.
    """
    page_range = recommendations['suggested_page_range']
    period_detail = (
        f"建议周期: {recommendations['suggested_period']}报\n"
        f"建议页数: {page_range[0]}-{page_range[1]} 页\n"
        "请确认或调整"
    )

    metric_total = len(recommendations['suggested_metrics'])
    metrics_detail = (
        f"检测到 {metric_total} 个可计算指标\n"
        f"已自动推荐主要的 {min(6, metric_total)} 个\n"
        "请确认或增删"
    )

    page_total = len(recommendations["suggested_pages"])
    pages_detail = f'推荐 {page_total} 个页面\n可增删调整页面顺序'

    prompts = {}
    prompts['period'] = {'question': '报告周期与页数范围', 'detail': period_detail}
    prompts['metrics'] = {'question': '核心指标集', 'detail': metrics_detail}
    prompts['audience'] = {
        'question': '受众与决策场景',
        'detail': '请选择: 管理层汇报 | 运营分析会 | 对外客户报告 | 自定义描述',
    }
    prompts['style'] = {
        'question': '视觉风格与配色方向',
        'detail': '推荐方案: 商务经典(深蓝) | 清新简约(绿色) | 深色专业 | 温暖品牌\n请选择配色方案',
    }
    prompts['pages'] = {'question': '页面结构与模板方案', 'detail': pages_detail}
    return prompts
if __name__ == '__main__':
    # Demo run: push a small hand-written profile through the analyzer and
    # print the resulting interaction prompts.
    demo_numeric = [
        {'column_name': '销售额', 'inferred_label': '销售额', 'numeric_stats': {'sum': 500000, 'mean': 154}},
        {'column_name': '订单量', 'inferred_label': '订单量', 'numeric_stats': {'sum': 3240, 'mean': 1.0}},
        {'column_name': '利润', 'inferred_label': '利润', 'numeric_stats': {'sum': 80000, 'mean': 25}},
    ]
    demo_categories = [
        {'column_name': '区域', 'inferred_label': '区域', 'unique_count': 5},
        {'column_name': '产品', 'inferred_label': '产品', 'unique_count': 12},
    ]
    demo_profile = {
        'total_rows': 3240,
        'total_columns': 15,
        'numeric_columns': demo_numeric,
        'category_columns': demo_categories,
        'time_columns': [{'column_name': '日期', 'inferred_label': '日期'}],
        'time_granularity': 'monthly',
        'date_range': ('2026-01-01', '2026-04-30'),
        'data_quality': {'score': 92},
    }

    demo_recs = analyze_and_recommend(demo_profile, PeriodType.MONTHLY)
    demo_prompts = generate_interaction_prompts(demo_recs, demo_profile)
    for section, prompt in demo_prompts.items():
        # A leading blank line separates consecutive prompts.
        print(f"\n{section}: {prompt['question']}", prompt['detail'], sep='\n')
|