data_profiler.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838
  1. """
  2. Universal data profiling engine: auto-detect schema, statistical features,
  3. and semantic inference from arbitrary Excel data.
  4. Enhanced with content-based value analysis, distribution shape analysis,
  5. derived metric detection, and multi-dimensional quality scoring.
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. from datetime import datetime, date
  10. from collections import Counter
  11. import re
  12. import math
  13. from report_config import ColumnProfile, ColumnRole
  14. # =====================================================================
  15. # DATE PATTERNS — expanded for broader format coverage
  16. # =====================================================================
  17. DATE_PATTERNS = [
  18. re.compile(r'^\d{4}年\d{1,2}月\d{1,2}日$'),
  19. re.compile(r'^\d{4}-\d{2}-\d{2}$'),
  20. re.compile(r'^\d{4}/\d{1,2}/\d{1,2}$'),
  21. re.compile(r'^\d{4}\.\d{1,2}\.\d{1,2}$'),
  22. re.compile(r'^\d{4}年\d{1,2}月$'),
  23. re.compile(r'^\d{4}-\d{2}$'),
  24. re.compile(r'^\d{2}-\d{2}$'),
  25. re.compile(r'^\d{2}/\d{2}$'),
  26. re.compile(r'^\d{8}$'), # YYYYMMDD
  27. ]
  28. TIME_KEYWORDS = [
  29. '日期', '时间', 'date', 'time', '年', '月', '日', '周', '期', '季度',
  30. 'period', 'month', 'year', 'quarter', 'week', 'day', 'timestamp',
  31. '月份', '年份', '周期', '时段', 'time', 'datetime',
  32. 'date', 'created', 'updated', 'modified', '发生', '录入', '创建',
  33. ]
  34. NUMERIC_KEYWORDS = [
  35. '金额', '数量', '台数', '件数', '元', '价格', '收入', '支出',
  36. '利润', '成本', '费用', '销量', '销售额', '总数', '合计',
  37. 'amount', 'price', 'qty', 'quantity', 'revenue', 'cost',
  38. 'sales', 'volume', 'value', 'total', 'sum', 'count',
  39. '单数', '笔数', '人数', '天数', '比率', '占比', '比例', '率',
  40. '预算', 'budget', '花费', 'spend', 'fee', '金额', '单价',
  41. 'unit', '得分', 'score', 'rating', '评分',
  42. ]
  43. CATEGORY_KEYWORDS = [
  44. '国家', '区域', '地区', '城市', '省份', '状态', '类型', '类别',
  45. '分类', '部门', '组', '等级', '级别', '品牌', '渠道',
  46. 'country', 'region', 'city', 'status', 'type', 'category',
  47. 'department', 'group', 'level', 'brand', 'channel',
  48. '负责人', 'owner', 'manager', '产品', 'product', '阶段',
  49. '供应商', 'supplier', '客户', 'customer', '行业', 'industry',
  50. '性别', 'gender', '职位', 'title', '角色', 'role', '标签', 'tag',
  51. '科目', 'account', '方向', 'direction', '方式', 'method',
  52. '意向', 'intent', 'intention', 'priority', '优先级',
  53. ]
  54. ID_KEYWORDS = [
  55. 'id', '编号', '序号', '代码', 'code', 'no', '编码', '合同号',
  56. '订单号', '工单号', '流水号', '单号', '标识', 'key',
  57. 'uuid', 'guid', 'sn', '序列号', '身份证', 'phone', '手机',
  58. '邮箱', 'email', '电话', 'tel', 'mobile',
  59. ]
  60. TEXT_KEYWORDS = [
  61. '备注', '描述', '说明', '详情', '内容', '意见', '建议', '进度更新',
  62. 'note', 'description', 'detail', 'remark', 'comment', 'memo',
  63. '地址', 'address', '介绍', '摘要', 'summary', '附注',
  64. '反馈', 'feedback', '理由', 'reason', '原因', 'cause',
  65. ]
  66. RATE_KEYWORDS = [
  67. '率', 'ratio', 'rate', '占比', '比例', 'percentage', 'pct',
  68. 'percent', 'conversion', '转化率', '完成率', '增长率',
  69. ]
  70. # Patterns for value-based content detection
  71. PHONE_PATTERN = re.compile(r'^[\+]?[\d\-\(\)\s]{6,20}$')
  72. EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
  73. URL_PATTERN = re.compile(r'^https?://', re.IGNORECASE)
  74. YEAR_PATTERN = re.compile(r'^\d{4}$')
  75. def _parse_date(val):
  76. if pd.isna(val):
  77. return None
  78. if isinstance(val, (datetime, date)):
  79. return val
  80. if isinstance(val, (int, float)) and not math.isnan(val):
  81. try:
  82. return pd.Timestamp(val).to_pydatetime()
  83. except (ValueError, OverflowError):
  84. pass
  85. s = str(val).strip()
  86. for pattern in DATE_PATTERNS:
  87. if pattern.match(s):
  88. for fmt in ('%Y年%m月%d日', '%Y-%m-%d', '%Y/%m/%d',
  89. '%Y.%m.%d', '%Y年%m月', '%Y-%m',
  90. '%m-%d', '%m/%d', '%Y%m%d'):
  91. try:
  92. return datetime.strptime(s, fmt)
  93. except ValueError:
  94. continue
  95. return None
  96. # =====================================================================
  97. # VALUE-BASED CONTENT ANALYSIS
  98. # =====================================================================
  99. def _analyze_value_patterns(series: pd.Series, sample_count: int = 100) -> dict:
  100. """Analyze actual data values to detect patterns and content types."""
  101. non_null = series.dropna().astype(str).head(sample_count)
  102. if len(non_null) == 0:
  103. return {}
  104. patterns = {}
  105. # Check if values look like percentages
  106. # Only flag as percentage if: ends with % OR is a decimal fraction (0.0-1.0)
  107. pct_like = sum(1 for v in non_null if v.endswith('%') or
  108. (v.replace('.', '', 1).lstrip('-').isdigit() and
  109. 0 < float(v) <= 1 and not v.isdigit()))
  110. patterns['pct_ratio'] = pct_like / len(non_null)
  111. # Check for yes/no or true/false patterns
  112. yn_vals = {'是', '否', 'yes', 'no', 'true', 'false', 'y', 'n', 't', 'f',
  113. '有', '无', '0', '1'}
  114. yn_like = sum(1 for v in non_null if v.lower() in yn_vals)
  115. patterns['binary_ratio'] = yn_like / len(non_null)
  116. # Check for ordinal/categorical short text
  117. short_text = sum(1 for v in non_null if len(v) <= 20)
  118. patterns['short_text_ratio'] = short_text / len(non_null)
  119. # Check for phone-like patterns
  120. phone_like = sum(1 for v in non_null if PHONE_PATTERN.match(v))
  121. patterns['phone_ratio'] = phone_like / len(non_null)
  122. # Check for email-like patterns
  123. email_like = sum(1 for v in non_null if EMAIL_PATTERN.match(v))
  124. patterns['email_ratio'] = email_like / len(non_null)
  125. # Check for URL-like patterns
  126. url_like = sum(1 for v in non_null if URL_PATTERN.match(v))
  127. patterns['url_ratio'] = url_like / len(non_null)
  128. # Check for pure digit strings (possible IDs)
  129. digit_only = sum(1 for v in non_null if v.isdigit() and len(v) >= 6)
  130. patterns['digit_id_ratio'] = digit_only / len(non_null)
  131. # Check for year-like values
  132. year_like = sum(1 for v in non_null if YEAR_PATTERN.match(v))
  133. patterns['year_ratio'] = year_like / len(non_null)
  134. # Detect ordinal levels
  135. ordinal_sets = [
  136. {'高', '中', '低', 'A', 'B', 'C', '甲', '乙', '丙'},
  137. {'一级', '二级', '三级', '四级', 'level 1', 'level 2', 'level 3'},
  138. {'critical', 'major', 'minor', 'high', 'medium', 'low'},
  139. ]
  140. for oset in ordinal_sets:
  141. ord_like = sum(1 for v in non_null if v in oset)
  142. if ord_like / len(non_null) > 0.3:
  143. patterns['ordinal'] = True
  144. break
  145. else:
  146. patterns['ordinal'] = False
  147. # Average text length
  148. patterns['avg_text_len'] = round(non_null.str.len().mean(), 1)
  149. # Unique ratio
  150. unique_ratio = series.nunique() / max(len(non_null), 1)
  151. patterns['unique_ratio'] = round(unique_ratio, 4)
  152. return patterns
  153. def _infer_role_from_values(value_patterns: dict, col_name: str,
  154. dtype_str: str, unique_count: int, total_rows: int) -> str:
  155. """Infer column role based on value content analysis results."""
  156. up = value_patterns
  157. # High ratio of email patterns
  158. if up.get('email_ratio', 0) > 0.5:
  159. return 'id'
  160. # High ratio of phone patterns
  161. if up.get('phone_ratio', 0) > 0.5:
  162. return 'id'
  163. # High ratio of URL patterns
  164. if up.get('url_ratio', 0) > 0.5:
  165. return 'text'
  166. # Mostly binary values (yes/no)
  167. if up.get('binary_ratio', 0) > 0.6:
  168. return 'category'
  169. # Mostly percentage values
  170. if up.get('pct_ratio', 0) > 0.5:
  171. return 'numeric'
  172. # High ratio of digit-only long strings (likely IDs)
  173. if up.get('digit_id_ratio', 0) > 0.5 and unique_count > total_rows * 0.5:
  174. return 'id'
  175. # Ordinal level detected
  176. if up.get('ordinal', False):
  177. return 'category'
  178. return None # No clear signal from values
  179. # =====================================================================
  180. # SEMANTIC KEYWORD-BASED ROLE INFERENCE
  181. # =====================================================================
  182. def _infer_column_role(col_name: str, dtype_str: str, sample_values: list,
  183. null_rate: float, unique_count: int, total_rows: int,
  184. value_patterns: dict = None) -> ColumnRole:
  185. col_lower = col_name.lower().strip()
  186. # 1) Value-based inference first (stronger signal)
  187. if value_patterns:
  188. value_role = _infer_role_from_values(value_patterns, col_name,
  189. dtype_str, unique_count, total_rows)
  190. if value_role:
  191. return ColumnRole(value_role)
  192. # 2) Keyword-based inference (expanded)
  193. if any(kw in col_lower for kw in ID_KEYWORDS):
  194. return ColumnRole.ID
  195. if any(kw in col_lower for kw in TIME_KEYWORDS):
  196. return ColumnRole.TIME
  197. if any(kw in col_lower for kw in NUMERIC_KEYWORDS):
  198. return ColumnRole.NUMERIC
  199. if any(kw in col_lower for kw in CATEGORY_KEYWORDS):
  200. return ColumnRole.CATEGORY
  201. if any(kw in col_lower for kw in TEXT_KEYWORDS):
  202. return ColumnRole.TEXT
  203. # 3) dtype-based fallback
  204. if 'int' in dtype_str or 'float' in dtype_str:
  205. if unique_count <= 15 and total_rows > 20:
  206. return ColumnRole.CATEGORY
  207. return ColumnRole.NUMERIC
  208. if 'bool' in dtype_str:
  209. return ColumnRole.CATEGORY
  210. if 'datetime' in dtype_str:
  211. return ColumnRole.TIME
  212. # 4) Cardinality-based inference
  213. if total_rows > 0:
  214. cardinality_ratio = unique_count / total_rows
  215. if cardinality_ratio > 0.8 and unique_count > 20:
  216. return ColumnRole.TEXT
  217. if cardinality_ratio < 0.3 and unique_count <= 30:
  218. return ColumnRole.CATEGORY
  219. return ColumnRole.TEXT
  220. def _infer_metric_label(col_name: str, role: ColumnRole, value_patterns: dict = None) -> str:
  221. col_lower = col_name.lower().strip()
  222. # If values are percentage-like, mark as '比率'
  223. if value_patterns and value_patterns.get('pct_ratio', 0) > 0.5:
  224. for kw in ['率', '转化', '占比', '比例']:
  225. if kw in col_lower:
  226. return col_name
  227. return col_name + '(占比)'
  228. label_map = {
  229. '金额': '金额', '销售额': '销售额', '收入': '收入', '利润': '利润',
  230. '数量': '数量', '台数': '台数', '件数': '件数', '订单数': '订单数',
  231. '成本': '成本', '费用': '费用', '销量': '销量', '占比': '占比',
  232. '天数': '天数', '人数': '人数', '比率': '比率', '转化率': '转化率',
  233. '增长率': '增长率', '完成率': '完成率', '单价': '单价',
  234. '价格': '价格', '得分': '得分', '评分': '评分',
  235. }
  236. for kw, label in label_map.items():
  237. if kw in col_lower:
  238. return label
  239. # Check for rate-related keywords
  240. if any(kw in col_lower for kw in RATE_KEYWORDS):
  241. return '比率'
  242. if role == ColumnRole.NUMERIC:
  243. return col_name
  244. elif role == ColumnRole.TIME:
  245. return '日期'
  246. elif role == ColumnRole.CATEGORY:
  247. return col_name
  248. return col_name
  249. def _infer_unit(col_name: str, value_patterns: dict = None) -> str:
  250. col_lower = col_name.lower().strip()
  251. # If values are percentage-like
  252. if value_patterns and value_patterns.get('pct_ratio', 0) > 0.5:
  253. return '%'
  254. unit_map = {
  255. '金额': '元', '销售额': '元', '收入': '元', '利润': '元',
  256. '成本': '元', '费用': '元', '台数': '台', '件数': '件',
  257. '数量': '', '人数': '人', '天数': '天', '占比': '%',
  258. '比率': '%', '比例': '%', '率': '%', '转化率': '%',
  259. '增长率': '%', '完成率': '%', '单价': '元', '价格': '元',
  260. '得分': '分', '评分': '分',
  261. }
  262. for kw, unit in unit_map.items():
  263. if kw in col_lower:
  264. return unit
  265. return ''
  266. # =====================================================================
  267. # DISTRIBUTION SHAPE ANALYSIS
  268. # =====================================================================
  269. def _calc_distribution_shape(series: pd.Series) -> dict:
  270. """Compute skewness, kurtosis and distribution type for numeric series."""
  271. try:
  272. s = series.dropna()
  273. if len(s) < 4:
  274. return {}
  275. skew = round(float(s.skew()), 3)
  276. kurt = round(float(s.kurtosis()), 3)
  277. # Determine distribution type
  278. abs_skew = abs(skew)
  279. if abs_skew < 0.5:
  280. skew_type = '近似对称'
  281. elif abs_skew < 1.0:
  282. skew_type = '轻度偏态'
  283. else:
  284. skew_type = '显著偏态'
  285. if skew > 0.5:
  286. skew_dir = '右偏(长尾在右侧,大部分值偏小)'
  287. elif skew < -0.5:
  288. skew_dir = '左偏(长尾在左侧,大部分值偏大)'
  289. else:
  290. skew_dir = '基本对称'
  291. # Concentration analysis
  292. cv = float(s.std()) / float(s.mean()) if float(s.mean()) != 0 else 0
  293. if cv < 0.3:
  294. concentration = '高度集中'
  295. elif cv < 0.7:
  296. concentration = '中度集中'
  297. elif cv < 1.2:
  298. concentration = '适度分散'
  299. else:
  300. concentration = '高度分散'
  301. return {
  302. 'skewness': skew,
  303. 'kurtosis': kurt,
  304. 'skew_type': skew_type,
  305. 'skew_direction': skew_dir,
  306. 'cv': round(cv, 3),
  307. 'concentration': concentration,
  308. }
  309. except Exception:
  310. return {}
  311. # =====================================================================
  312. # DERIVED METRIC DETECTION
  313. # =====================================================================
  314. def _detect_derived_relations(df: pd.DataFrame, numeric_cols: list) -> list[dict]:
  315. """
  316. Detect potential derived relationships among numeric columns.
  317. E.g., A - B = C, A + B = C, A / B = C (approx.)
  318. """
  319. relations = []
  320. num_names = [c['column_name'] for c in numeric_cols]
  321. if len(num_names) < 3:
  322. return relations
  323. sample = df[num_names].dropna().head(500)
  324. for i, a_name in enumerate(num_names):
  325. for j, b_name in enumerate(num_names):
  326. if j <= i:
  327. continue
  328. a = sample[a_name]
  329. b = sample[b_name]
  330. # Check subtraction: a - b ≈ c or b - a ≈ c
  331. for diff_name in num_names:
  332. if diff_name in (a_name, b_name):
  333. continue
  334. d = sample[diff_name]
  335. diff_ab = (a - b - d).abs().mean()
  336. diff_ba = (b - a - d).abs().mean()
  337. threshold = max(d.mean(), 1) * 0.1
  338. if diff_ab < threshold:
  339. relations.append({
  340. 'type': 'subtraction',
  341. 'expression': f'{a_name} - {b_name} ≈ {diff_name}',
  342. 'accuracy': round(float(1 - diff_ab / max(float(d.mean()), 1)), 3),
  343. 'formula': f'{diff_name} = {a_name} - {b_name}',
  344. })
  345. break
  346. elif diff_ba < threshold:
  347. relations.append({
  348. 'type': 'subtraction',
  349. 'expression': f'{b_name} - {a_name} ≈ {diff_name}',
  350. 'accuracy': round(float(1 - diff_ba / max(float(d.mean()), 1)), 3),
  351. 'formula': f'{diff_name} = {b_name} - {a_name}',
  352. })
  353. break
  354. # Check addition: a + b ≈ c
  355. for sum_name in num_names:
  356. if sum_name in (a_name, b_name):
  357. continue
  358. s = sample[sum_name]
  359. sum_ab = (a + b - s).abs().mean()
  360. threshold = max(s.mean(), 1) * 0.1
  361. if sum_ab < threshold:
  362. relations.append({
  363. 'type': 'addition',
  364. 'expression': f'{a_name} + {b_name} ≈ {sum_name}',
  365. 'accuracy': round(float(1 - sum_ab / max(float(s.mean()), 1)), 3),
  366. 'formula': f'{sum_name} = {a_name} + {b_name}',
  367. })
  368. break
  369. # Also check for ratio relations
  370. if len(num_names) >= 2:
  371. for i, a_name in enumerate(num_names):
  372. for j, b_name in enumerate(num_names):
  373. if j <= i:
  374. continue
  375. a = sample[a_name]
  376. b = sample[b_name]
  377. ratio = (a / b.replace(0, np.nan)).dropna()
  378. if len(ratio) > 0:
  379. ratio_std = float(ratio.std())
  380. ratio_mean = float(ratio.mean())
  381. if ratio_mean > 0 and ratio_std / ratio_mean < 0.1:
  382. # Consistent ratio found
  383. relations.append({
  384. 'type': 'ratio',
  385. 'expression': f'{a_name} / {b_name} ≈ {ratio_mean:.3f}',
  386. 'accuracy': round(float(1 - ratio_std / ratio_mean), 3),
  387. 'formula': f'{a_name} = {b_name} × {ratio_mean:.2f}',
  388. })
  389. return relations
  390. # =====================================================================
  391. # MAIN PROFILING FUNCTION
  392. # =====================================================================
  393. def profile_dataframe(df: pd.DataFrame) -> dict:
  394. total_rows = len(df)
  395. columns = []
  396. for col in df.columns:
  397. series = df[col]
  398. dtype_str = str(series.dtype)
  399. null_count = int(series.isna().sum())
  400. null_rate = round(null_count / total_rows, 4) if total_rows else 0.0
  401. non_null = series.dropna()
  402. unique_count = int(non_null.nunique())
  403. sample_values = non_null.head(5).tolist()
  404. sample_values = [str(v) for v in sample_values]
  405. # Enhanced: Value pattern analysis
  406. value_patterns = _analyze_value_patterns(series)
  407. # Enhanced: Distribution shape analysis for numeric columns
  408. distribution_shape = None
  409. numeric_stats = None
  410. if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
  411. try:
  412. numeric_stats = {
  413. 'mean': round(float(series.mean()), 2) if not pd.isna(series.mean()) else 0,
  414. 'median': round(float(series.median()), 2) if not pd.isna(series.median()) else 0,
  415. 'min': round(float(series.min()), 2) if not pd.isna(series.min()) else 0,
  416. 'max': round(float(series.max()), 2) if not pd.isna(series.max()) else 0,
  417. 'std': round(float(series.std()), 2) if not pd.isna(series.std()) else 0,
  418. 'sum': round(float(series.sum()), 2) if not pd.isna(series.sum()) else 0,
  419. 'p25': round(float(series.quantile(0.25)), 2) if not pd.isna(series.quantile(0.25)) else 0,
  420. 'p75': round(float(series.quantile(0.75)), 2) if not pd.isna(series.quantile(0.75)) else 0,
  421. }
  422. distribution_shape = _calc_distribution_shape(series)
  423. except Exception:
  424. numeric_stats = None
  425. # Enhanced role inference with value patterns
  426. role = _infer_column_role(col, dtype_str, sample_values, null_rate,
  427. unique_count, total_rows, value_patterns)
  428. label = _infer_metric_label(col, role, value_patterns)
  429. unit = _infer_unit(col, value_patterns)
  430. # Enhanced: detect if column is a high-cardinality ID
  431. is_high_cardinality_id = (role == ColumnRole.TEXT and
  432. unique_count / max(total_rows, 1) > 0.8 and
  433. unique_count > 20)
  434. if is_high_cardinality_id:
  435. role = ColumnRole.ID
  436. columns.append(ColumnProfile(
  437. column_name=col,
  438. dtype=dtype_str,
  439. role=role,
  440. null_count=null_count,
  441. null_rate=null_rate,
  442. unique_count=unique_count,
  443. sample_values=sample_values,
  444. numeric_stats=numeric_stats,
  445. inferred_label=label,
  446. ))
  447. # Append extra metadata not in ColumnProfile
  448. columns[-1]._unit = unit
  449. columns[-1]._distribution_shape = distribution_shape
  450. columns[-1]._value_patterns = value_patterns
  451. time_cols = [c for c in columns if c.role == ColumnRole.TIME]
  452. numeric_cols = [c for c in columns if c.role == ColumnRole.NUMERIC]
  453. category_cols = [c for c in columns if c.role == ColumnRole.CATEGORY]
  454. text_cols = [c for c in columns if c.role == ColumnRole.TEXT]
  455. id_cols = [c for c in columns if c.role == ColumnRole.ID]
  456. # Date range inference
  457. date_range = (None, None)
  458. time_granularity = 'unknown'
  459. if time_cols:
  460. series = df[time_cols[0].column_name].dropna()
  461. parsed = series.apply(_parse_date).dropna()
  462. if len(parsed) > 0:
  463. date_range = (parsed.min(), parsed.max())
  464. if len(parsed) >= 2:
  465. diff = (parsed.max() - parsed.min()).days
  466. if diff <= 1:
  467. time_granularity = 'daily'
  468. elif diff <= 7:
  469. time_granularity = 'weekly'
  470. elif diff <= 31:
  471. time_granularity = 'monthly'
  472. elif diff <= 92:
  473. time_granularity = 'quarterly'
  474. else:
  475. time_granularity = 'yearly'
  476. # Enhanced: Detect derived relations among numeric columns
  477. derived_relations = _detect_derived_relations(df, [c.__dict__ for c in numeric_cols])
  478. # Enhanced: Multi-dimensional quality scoring
  479. quality_score, quality_details = _calc_quality_score(
  480. df, columns, numeric_cols, date_range
  481. )
  482. # Outlier detection (Enhanced: with CV-based filtering)
  483. outlier_columns = []
  484. for c in numeric_cols:
  485. ns = c.numeric_stats
  486. if ns and ns.get('std', 0) > 0 and ns.get('mean', 0) > 0:
  487. cv = ns['std'] / ns['mean']
  488. if cv > 3:
  489. outlier_columns.append(c.column_name)
  490. return {
  491. 'total_rows': total_rows,
  492. 'total_columns': len(columns),
  493. 'columns': [c.__dict__ for c in columns],
  494. 'time_columns': [c.__dict__ for c in time_cols],
  495. 'numeric_columns': [c.__dict__ for c in numeric_cols],
  496. 'category_columns': [c.__dict__ for c in category_cols],
  497. 'text_columns': [c.__dict__ for c in text_cols],
  498. 'id_columns': [c.__dict__ for c in id_cols],
  499. 'date_range': (
  500. date_range[0].strftime('%Y-%m-%d') if date_range[0] else None,
  501. date_range[1].strftime('%Y-%m-%d') if date_range[1] else None,
  502. ),
  503. 'time_granularity': time_granularity,
  504. 'data_quality': {
  505. 'score': quality_score,
  506. 'details': quality_details,
  507. 'high_null_columns': [c.column_name for c in columns if c.null_rate > 0.3],
  508. 'outlier_columns': outlier_columns,
  509. },
  510. 'derived_relations': derived_relations,
  511. 'column_stats': [{
  512. 'column_name': col.column_name,
  513. 'role': col.role.value,
  514. 'dtype': col.dtype,
  515. 'null_rate': col.null_rate,
  516. 'unique_count': col.unique_count,
  517. 'distribution_shape': getattr(col, '_distribution_shape', None),
  518. 'inferred_label': col.inferred_label,
  519. 'unit': getattr(col, '_unit', ''),
  520. 'numeric_stats': col.numeric_stats,
  521. } for col in columns],
  522. }
  523. # =====================================================================
  524. # ENHANCED QUALITY SCORING
  525. # =====================================================================
  526. def _calc_quality_score(df: pd.DataFrame, columns: list,
  527. numeric_cols: list, date_range: tuple) -> tuple:
  528. """Multi-dimensional quality scoring (0-100)."""
  529. score = 100
  530. details = {}
  531. # 1) Completeness (30%) — null rates
  532. avg_null_rate = np.mean([c.null_rate for c in columns]) if columns else 0
  533. completeness_penalty = min(30, avg_null_rate * 100 * 2)
  534. completeness = max(0, 30 - completeness_penalty)
  535. details['completeness'] = round(completeness, 1)
  536. # 2) Uniqueness (20%) — presence of ID columns or unique identifiers
  537. id_ratio = len([c for c in columns if c.role == ColumnRole.ID]) / max(len(columns), 1)
  538. uniqueness = min(20, 10 + id_ratio * 10)
  539. details['uniqueness'] = round(uniqueness, 1)
  540. # 3) Numeric health (25%) — outliers, zeros, negative values
  541. numeric_health = 25
  542. for c in numeric_cols:
  543. series = df[c.column_name].dropna()
  544. if len(series) == 0:
  545. continue
  546. # Check for negative values in non-negative expected columns
  547. if c.inferred_label in ('台数', '数量', '金额', '人数'):
  548. neg_ratio = (series < 0).sum() / len(series)
  549. if neg_ratio > 0.05:
  550. numeric_health -= 5
  551. # Check for excessive zeros
  552. zero_ratio = (series == 0).sum() / len(series)
  553. if zero_ratio > 0.5:
  554. numeric_health -= 3
  555. details['numeric_health'] = max(0, numeric_health)
  556. # 4) Temporal consistency (15%) — if time columns exist, check date ordering
  557. temporal = 15
  558. if date_range[0] and date_range[1]:
  559. if date_range[0] <= date_range[1]:
  560. temporal = 15
  561. else:
  562. temporal = 5
  563. details['temporal_consistency'] = temporal
  564. # 5) Completeness of categorical data (10%)
  565. cat_health = 10
  566. for c in columns:
  567. if c.role == ColumnRole.CATEGORY and c.null_rate > 0.2:
  568. cat_health -= 2
  569. details['categorical_health'] = max(0, cat_health)
  570. score = completeness + uniqueness + numeric_health + temporal + cat_health
  571. score = max(0, min(100, round(score)))
  572. return score, details
  573. # =====================================================================
  574. # HELPER FUNCTIONS (enhanced)
  575. # =====================================================================
  576. def profile_category_distribution(df: pd.DataFrame, col_name: str, top_n: int = 15) -> dict:
  577. if col_name not in df.columns:
  578. return {}
  579. counts = df[col_name].value_counts().head(top_n).to_dict()
  580. total = df[col_name].notna().sum()
  581. # Calculate concentration (Herfindahl index)
  582. pcts = [v / total for v in counts.values()] if total else []
  583. hhi = sum(p * p for p in pcts) if pcts else 0
  584. return {
  585. 'total_categories': df[col_name].nunique(),
  586. 'top_items': {str(k): {'count': int(v), 'pct': round(v / total * 100, 1) if total else 0}
  587. for k, v in counts.items()},
  588. 'concentration_hhi': round(hhi, 4),
  589. 'concentration_label': '高度集中' if hhi > 0.5 else '中度集中' if hhi > 0.2 else '分散',
  590. }
  591. def profile_numeric_series(df: pd.DataFrame, col_name: str) -> dict:
  592. if col_name not in df.columns:
  593. return {}
  594. series = df[col_name].dropna()
  595. if len(series) == 0:
  596. return {}
  597. shape = _calc_distribution_shape(series)
  598. result = {
  599. 'count': len(series),
  600. 'sum': round(float(series.sum()), 2),
  601. 'mean': round(float(series.mean()), 2),
  602. 'median': round(float(series.median()), 2),
  603. 'min': round(float(series.min()), 2),
  604. 'max': round(float(series.max()), 2),
  605. 'std': round(float(series.std()), 2),
  606. }
  607. if shape:
  608. result.update(shape)
  609. return result
  610. def detect_data_issues(df: pd.DataFrame) -> list[dict]:
  611. issues = []
  612. for col in df.columns:
  613. null_rate = df[col].isna().mean()
  614. if null_rate > 0.5:
  615. issues.append({
  616. 'column': col,
  617. 'type': 'high_missing',
  618. 'severity': 'major',
  619. 'message': f'列"{col}"缺失率{null_rate:.1%},建议排除或补全',
  620. })
  621. elif null_rate > 0.1:
  622. issues.append({
  623. 'column': col,
  624. 'type': 'moderate_missing',
  625. 'severity': 'minor',
  626. 'message': f'列"{col}"缺失率{null_rate:.1%}',
  627. })
  628. if pd.api.types.is_numeric_dtype(df[col]):
  629. series = df[col].dropna()
  630. if len(series) > 0:
  631. q1, q3 = series.quantile(0.25), series.quantile(0.75)
  632. iqr = q3 - q1
  633. lower, upper = q1 - 3 * iqr, q3 + 3 * iqr
  634. outlier_count = ((series < lower) | (series > upper)).sum()
  635. if outlier_count > len(series) * 0.1:
  636. issues.append({
  637. 'column': col,
  638. 'type': 'outliers',
  639. 'severity': 'major',
  640. 'message': f'列"{col}"存在{outlier_count}个异常值({outlier_count/len(series):.1%})',
  641. })
  642. # Check for negative values
  643. neg_count = (series < 0).sum()
  644. if neg_count > 0:
  645. issues.append({
  646. 'column': col,
  647. 'type': 'negative_values',
  648. 'severity': 'minor',
  649. 'message': f'列"{col}"存在{neg_count}个负值',
  650. })
  651. # Check for constant columns
  652. if df[col].nunique() <= 1 and null_rate < 1.0:
  653. issues.append({
  654. 'column': col,
  655. 'type': 'constant_column',
  656. 'severity': 'minor',
  657. 'message': f'列"{col}"为常量列(仅1个唯一值),对分析无贡献',
  658. })
  659. return issues
  660. def generate_summary_text(profile: dict) -> str:
  661. lines = []
  662. lines.append(f"共 {profile['total_rows']:,} 行 × {profile['total_columns']} 列")
  663. num_cols = profile.get('numeric_columns', [])
  664. cat_cols = profile.get('category_columns', [])
  665. time_cols = profile.get('time_columns', [])
  666. lines.append(f"数值列: {len(num_cols)} 个 | 分类列: {len(cat_cols)} 个 | 时间列: {len(time_cols)} 个")
  667. dr = profile.get('date_range', (None, None))
  668. if dr[0]:
  669. lines.append(f"时间范围: {dr[0]} ~ {dr[1]}")
  670. lines.append(f"时间粒度: {profile.get('time_granularity', 'unknown')}")
  671. q = profile.get('data_quality', {})
  672. lines.append(f"数据质量评分: {q.get('score', 0)}/100")
  673. if q.get('details'):
  674. det = q['details']
  675. lines.append(f" 完整性: {det.get('completeness', 0)}/30 | "
  676. f"数值健康: {det.get('numeric_health', 0)}/25 | "
  677. f"时间一致性: {det.get('temporal_consistency', 0)}/15")
  678. if q.get('high_null_columns'):
  679. lines.append(f"高缺失列: {', '.join(q['high_null_columns'])}")
  680. # Enhanced: derived relations
  681. derived = profile.get('derived_relations', [])
  682. if derived:
  683. lines.append(f"检测到 {len(derived)} 个数值关系:")
  684. for rel in derived[:5]:
  685. lines.append(f" {rel['formula']} (置信度: {rel['accuracy']:.0%})")
  686. # Distribution shape summary for numeric columns
  687. shape_cols = []
  688. for nc in num_cols[:3]:
  689. shape = nc.get('distribution_shape')
  690. if shape:
  691. shape_cols.append(f"{nc.get('inferred_label', nc['column_name'])}[{shape.get('concentration', 'N/A')}]")
  692. if shape_cols:
  693. lines.append(f"分布特征: {' | '.join(shape_cols)}")
  694. return '\n'.join(lines)
  695. if __name__ == '__main__':
  696. import sys
  697. if len(sys.argv) > 1:
  698. fp = sys.argv[1]
  699. try:
  700. df = pd.read_excel(fp)
  701. except Exception:
  702. df = pd.read_excel(fp, sheet_name=0)
  703. profile = profile_dataframe(df)
  704. print(generate_summary_text(profile))
  705. issues = detect_data_issues(df)
  706. if issues:
  707. print(f"\n数据问题 ({len(issues)}):")
  708. for iss in issues:
  709. print(f" [{iss['severity']}] {iss['message']}")