data_loader.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. """
  2. Excel data loader for daily/weekly/monthly report generation.
  3. Contains both legacy order-specific loaders and enhanced generic loaders.
  4. """
  5. import pandas as pd
  6. from datetime import datetime, timedelta
  7. import re
  8. import warnings
  9. import os
  10. import io
  11. import csv
  12. # =====================================================================
  13. # LEGACY SECTION — Order-specific loaders (kept for backward compat)
  14. # =====================================================================
  15. FIELD_MAP = {
  16. '序号': 'seq',
  17. '目的国家': 'country',
  18. '合同号': 'contract_no',
  19. '用户名称/公司': 'customer',
  20. '意向车型及数量': 'product_info',
  21. '订单总数量': 'order_qty',
  22. '负责人': 'owner',
  23. '当前状态': 'status',
  24. '拟定合同时间': 'contract_date',
  25. '跟单天数': 'tracking_days',
  26. '定金支付时间': 'deposit_date',
  27. '订金认领时间': 'deposit_claim_date',
  28. '订单生成时间': 'order_gen_date',
  29. '价格评审时间': 'price_review_date',
  30. '合同评审时间': 'contract_review_date',
  31. '合同提交盖章申请时间': 'seal_apply_date',
  32. '合同盖章时间': 'seal_date',
  33. '车辆下线入库状态': 'inventory_status',
  34. '尾款支付时间': 'final_pay_date',
  35. '尾款认领时间': 'final_claim_date',
  36. '智慧关务信息维护': 'customs_date',
  37. '许可证办理时间': 'license_date',
  38. '车辆发运时间': 'ship_date',
  39. '预计开票时间': 'invoice_date',
  40. '今日进度更新': 'progress_update',
  41. '是否更新': 'is_updated',
  42. '支持需求': 'support_request',
  43. '4月交付': 'deliver_apr',
  44. '5月预测': 'forecast_may',
  45. }
  46. STATUS_ORDER = ['A', 'B', 'C', 'D', 'E', 'F']
  47. STATUS_LABELS = {
  48. 'A': '合同拟定中',
  49. 'B': '已锁定合同待付订金',
  50. 'C': '已付订金待生产',
  51. 'D': '已生产待付尾款',
  52. 'E': '已付尾款待发运',
  53. 'F': '已发运',
  54. }
  55. def _normalize_status(val):
  56. """Extract status code A-F from status string."""
  57. if pd.isna(val):
  58. return None
  59. s = str(val).strip()
  60. m = re.match(r'^([A-F])', s)
  61. if m:
  62. return m.group(1)
  63. return None
  64. def _parse_date(val):
  65. """Parse various date formats."""
  66. if pd.isna(val):
  67. return None
  68. if isinstance(val, datetime):
  69. return val
  70. for fmt in ('%Y-%m-%d', '%Y/%m/%d', '%Y年%m月%d日'):
  71. try:
  72. return datetime.strptime(str(val).strip(), fmt)
  73. except ValueError:
  74. continue
  75. return None
  76. def _sheet_name_for_date(date: datetime) -> str:
  77. """Convert datetime to expected sheet name."""
  78. return date.strftime('%Y年%m月%d日')
  79. def load_workbook_metadata(filepath: str) -> dict:
  80. """Return workbook metadata: sheet names, date range."""
  81. xl = pd.ExcelFile(filepath)
  82. sheets = xl.sheet_names
  83. dates = []
  84. for s in sheets:
  85. try:
  86. d = datetime.strptime(s, '%Y年%m月%d日')
  87. dates.append(d)
  88. except ValueError:
  89. continue
  90. dates.sort()
  91. return {
  92. 'sheets': sheets,
  93. 'date_range': (dates[0], dates[-1]) if dates else (None, None),
  94. 'total_days': len(dates),
  95. }
  96. def load_daily(filepath: str, date: datetime) -> pd.DataFrame:
  97. """Load single-day order data."""
  98. sheet = _sheet_name_for_date(date)
  99. df = pd.read_excel(filepath, sheet_name=sheet)
  100. return _clean_dataframe(df)
  101. def load_date_range(filepath: str, start: datetime, end: datetime) -> pd.DataFrame:
  102. """Load and concatenate data across a date range [start, end]."""
  103. xl = pd.ExcelFile(filepath)
  104. frames = []
  105. current = start
  106. while current <= end:
  107. sheet = _sheet_name_for_date(current)
  108. if sheet in xl.sheet_names:
  109. df = pd.read_excel(filepath, sheet_name=sheet)
  110. df['_data_date'] = current
  111. frames.append(df)
  112. current += timedelta(days=1)
  113. if not frames:
  114. raise ValueError(f"No data found between {start.date()} and {end.date()}")
  115. combined = pd.concat(frames, ignore_index=True)
  116. return _clean_dataframe(combined)
  117. def load_weekly(filepath: str, year: int, week_num: int, week_start_day=0) -> tuple:
  118. """
  119. Load data for a specific week.
  120. Returns (current_week_df, prev_week_df).
  121. week_start_day: 0=Monday, 6=Sunday
  122. """
  123. meta = load_workbook_metadata(filepath)
  124. first_date, last_date = meta['date_range']
  125. if first_date is None:
  126. raise ValueError("No valid date sheets found")
  127. jan4 = datetime(year, 1, 4)
  128. jan4_monday = jan4 - timedelta(days=jan4.weekday())
  129. target_monday = jan4_monday + timedelta(weeks=week_num - 1)
  130. target_sunday = target_monday + timedelta(days=6)
  131. start = max(target_monday, first_date)
  132. end = min(target_sunday, last_date)
  133. current = load_date_range(filepath, start, end)
  134. prev_start = start - timedelta(days=7)
  135. prev_end = end - timedelta(days=7)
  136. if prev_start >= first_date:
  137. previous = load_date_range(filepath, prev_start, prev_end)
  138. else:
  139. previous = pd.DataFrame(columns=current.columns)
  140. return current, previous
  141. def load_monthly(filepath: str, year: int, month: int) -> tuple:
  142. """
  143. Load data for a specific month.
  144. Returns (current_month_df, prev_month_df, yoy_month_df).
  145. """
  146. start = datetime(year, month, 1)
  147. if month == 12:
  148. end = datetime(year + 1, 1, 1) - timedelta(days=1)
  149. else:
  150. end = datetime(year, month + 1, 1) - timedelta(days=1)
  151. current = load_date_range(filepath, start, end)
  152. if month == 1:
  153. prev_start = datetime(year - 1, 12, 1)
  154. prev_end = datetime(year, 1, 1) - timedelta(days=1)
  155. else:
  156. prev_start = datetime(year, month - 1, 1)
  157. prev_end = datetime(year, month, 1) - timedelta(days=1)
  158. try:
  159. previous = load_date_range(filepath, prev_start, prev_end)
  160. except ValueError:
  161. previous = pd.DataFrame(columns=current.columns)
  162. yoy_start = datetime(year - 1, month, 1)
  163. if month == 12:
  164. yoy_end = datetime(year, 1, 1) - timedelta(days=1)
  165. else:
  166. yoy_end = datetime(year - 1, month + 1, 1) - timedelta(days=1)
  167. try:
  168. yoy = load_date_range(filepath, yoy_start, yoy_end)
  169. except ValueError:
  170. yoy = pd.DataFrame(columns=current.columns)
  171. return current, previous, yoy
  172. def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
  173. """Rename columns, parse dates, clean statuses (legacy)."""
  174. rename_map = {k: v for k, v in FIELD_MAP.items() if k in df.columns}
  175. df = df.rename(columns=rename_map)
  176. if 'status' in df.columns:
  177. df['status_code'] = df['status'].apply(_normalize_status)
  178. if 'order_qty' in df.columns:
  179. df['order_qty'] = pd.to_numeric(df['order_qty'], errors='coerce')
  180. date_fields = ['contract_date', 'deposit_date', 'order_gen_date',
  181. 'price_review_date', 'contract_review_date', 'seal_apply_date',
  182. 'seal_date', 'final_pay_date', 'customs_date', 'license_date',
  183. 'ship_date', 'invoice_date']
  184. for field in date_fields:
  185. if field in df.columns:
  186. df[field] = df[field].apply(_parse_date)
  187. if 'tracking_days' in df.columns:
  188. df['tracking_days'] = pd.to_numeric(df['tracking_days'], errors='coerce')
  189. if 'is_updated' in df.columns:
  190. df['is_updated_flag'] = df['is_updated'].astype(str).str.strip() == '是'
  191. return df
  192. # =====================================================================
  193. # GENERIC LOADING SECTION — Universal loaders for any Excel data
  194. # =====================================================================
  195. # Summary row keywords (Chinese and English) to auto-detect and skip
  196. SUMMARY_KEYWORDS = [
  197. '合计', '总计', '小计', '汇总', '累计', '总和',
  198. 'total', 'sum', 'subtotal', 'grand total', '合计:', '总计:',
  199. '平均', 'avg', 'average',
  200. ]
  201. # Encoding priority list for CSV detection
  202. CSV_ENCODINGS = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'gb18030', 'latin-1', 'cp1252']
  203. def auto_detect_file_format(filepath: str) -> str:
  204. """Auto-detect file format: xlsx, xls, csv, or unknown."""
  205. ext = os.path.splitext(filepath)[1].lower()
  206. if ext in ('.xlsx', '.xls'):
  207. return ext[1:]
  208. if ext == '.csv':
  209. return 'csv'
  210. if ext in ('.xlsm', '.xlsb'):
  211. return ext[1:]
  212. return 'unknown'
  213. def load_generic_csv(filepath: str, encoding=None, **kwargs) -> pd.DataFrame:
  214. """
  215. Load a CSV file with auto-encoding detection.
  216. Tries common encodings until one succeeds.
  217. """
  218. if encoding:
  219. try:
  220. return pd.read_csv(filepath, encoding=encoding, **kwargs)
  221. except (UnicodeDecodeError, UnicodeError):
  222. raise ValueError(f"Failed to decode {filepath} with encoding {encoding}")
  223. last_error = None
  224. for enc in CSV_ENCODINGS:
  225. try:
  226. df = pd.read_csv(filepath, encoding=enc, **kwargs)
  227. if len(df.columns) > 0:
  228. return df
  229. except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError) as e:
  230. last_error = e
  231. continue
  232. raise ValueError(
  233. f"Unable to decode CSV file {filepath}. Tried encodings: "
  234. f"{CSV_ENCODINGS}. Last error: {last_error}"
  235. )
  236. def _detect_and_skip_footer_rows(df_raw: pd.DataFrame) -> pd.DataFrame:
  237. """Detect and remove summary/aggregation rows at the end of the data."""
  238. if df_raw.empty:
  239. return df_raw
  240. rows_to_drop = []
  241. text_cols = [c for c in df_raw.columns if df_raw[c].dtype == 'object']
  242. for idx in range(len(df_raw) - 1, -1, -1):
  243. row = df_raw.iloc[idx]
  244. is_summary = False
  245. for col in text_cols:
  246. val = str(row.get(col, '')).strip().lower()
  247. if any(kw in val for kw in SUMMARY_KEYWORDS):
  248. is_summary = True
  249. break
  250. if is_summary:
  251. rows_to_drop.append(idx)
  252. else:
  253. break
  254. if len(rows_to_drop) > 20:
  255. break
  256. if rows_to_drop:
  257. df_raw = df_raw.drop(index=rows_to_drop)
  258. df_raw = df_raw.reset_index(drop=True)
  259. return df_raw
  260. def _detect_empty_or_notes_rows(df_raw: pd.DataFrame) -> pd.DataFrame:
  261. """Remove leading empty rows and trailing fully-empty rows."""
  262. if df_raw.empty:
  263. return df_raw
  264. non_empty_rows = df_raw.notna().any(axis=1)
  265. first_valid = non_empty_rows.idxmax() if non_empty_rows.any() else 0
  266. last_valid = non_empty_rows[non_empty_rows].index[-1] if non_empty_rows.any() else len(df_raw)
  267. df_raw = df_raw.iloc[first_valid:last_valid + 1].reset_index(drop=True)
  268. return df_raw
  269. def normalize_column_names(col_name: str) -> str:
  270. """
  271. Normalize a single column name: strip whitespace, unify brackets, remove special chars.
  272. """
  273. if not isinstance(col_name, str):
  274. return col_name
  275. name = col_name.strip()
  276. name = name.replace('(', '(').replace(')', ')')
  277. name = name.replace('【', '[').replace('】', ']')
  278. name = name.replace('\n', ' ').replace('\r', ' ')
  279. name = re.sub(r'\s+', ' ', name)
  280. return name
  281. def _is_unnamed_column(col) -> bool:
  282. return str(col).strip().lower().startswith('unnamed')
  283. def _dedupe_columns(columns) -> list:
  284. seen = {}
  285. result = []
  286. for idx, col in enumerate(columns):
  287. name = normalize_column_names(str(col).strip()) if col is not None else ''
  288. if not name or name.lower() in ('nan', 'none') or _is_unnamed_column(name):
  289. name = f'column_{idx + 1}'
  290. base = name
  291. count = seen.get(base, 0)
  292. if count:
  293. name = f'{base}_{count + 1}'
  294. seen[base] = count + 1
  295. result.append(name)
  296. return result
  297. def _row_header_score(values) -> float:
  298. cells = [str(v).strip() for v in values if pd.notna(v) and str(v).strip()]
  299. if not cells:
  300. return 0
  301. non_numeric = 0
  302. unique = set()
  303. for cell in cells:
  304. unique.add(cell)
  305. try:
  306. float(cell.replace(',', '').replace('%', ''))
  307. is_numeric = True
  308. except Exception:
  309. is_numeric = False
  310. if not is_numeric:
  311. non_numeric += 1
  312. return len(cells) + non_numeric * 1.5 + len(unique) * 0.5
  313. def _detect_header_row(raw_df: pd.DataFrame, max_scan_rows: int = 8) -> int:
  314. if raw_df.empty:
  315. return 0
  316. scan_rows = min(max_scan_rows, len(raw_df))
  317. best_idx = 0
  318. best_score = -1
  319. for idx in range(scan_rows):
  320. row = raw_df.iloc[idx]
  321. score = _row_header_score(row)
  322. next_non_empty = 0
  323. if idx + 1 < len(raw_df):
  324. next_non_empty = raw_df.iloc[idx + 1].notna().sum()
  325. # Prefer rows with many non-numeric labels and data underneath.
  326. score += min(next_non_empty, len(raw_df.columns)) * 0.2
  327. if score > best_score:
  328. best_score = score
  329. best_idx = idx
  330. return best_idx
  331. def _dataframe_from_detected_header(raw_df: pd.DataFrame, header_row='auto') -> pd.DataFrame:
  332. raw_df = raw_df.dropna(how='all').reset_index(drop=True)
  333. raw_df = raw_df.dropna(axis=1, how='all')
  334. if raw_df.empty:
  335. return raw_df
  336. if header_row == 'auto':
  337. header_idx = _detect_header_row(raw_df)
  338. elif header_row is None:
  339. header_idx = 0
  340. else:
  341. header_idx = int(header_row)
  342. header_idx = max(0, min(header_idx, len(raw_df) - 1))
  343. columns = _dedupe_columns(raw_df.iloc[header_idx].tolist())
  344. df = raw_df.iloc[header_idx + 1:].copy().reset_index(drop=True)
  345. df.columns = columns
  346. return df
  347. def _clean_generic_dataframe(df: pd.DataFrame, skip_summary_rows=True) -> pd.DataFrame:
  348. """
  349. Universal DataFrame cleaning:
  350. - Remove fully empty rows/columns
  351. - Drop 'Unnamed' columns
  352. - Normalize column names (strip whitespace, unify brackets)
  353. - Auto-detect and remove summary/total rows
  354. - Try to parse date columns
  355. - Try to parse numeric columns
  356. """
  357. if df.empty:
  358. return df
  359. df = df.dropna(how='all').reset_index(drop=True)
  360. df = df.dropna(axis=1, how='all')
  361. unnamed_mask = df.columns.map(_is_unnamed_column)
  362. if unnamed_mask.any() and (~unnamed_mask).any():
  363. df = df.loc[:, ~unnamed_mask]
  364. df = df.rename(columns=normalize_column_names)
  365. if skip_summary_rows:
  366. df = _detect_and_skip_footer_rows(df)
  367. df = _detect_empty_or_notes_rows(df)
  368. for col in df.columns:
  369. if df[col].dtype == 'object':
  370. try:
  371. try:
  372. parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
  373. except TypeError:
  374. parsed = pd.to_datetime(df[col], errors='coerce')
  375. if parsed.notna().sum() > len(df) * 0.7:
  376. df[col] = parsed
  377. continue
  378. except Exception:
  379. pass
  380. try:
  381. numeric = pd.to_numeric(df[col], errors='coerce')
  382. if numeric.notna().sum() > len(df) * 0.7:
  383. df[col] = numeric
  384. except Exception:
  385. pass
  386. return df
  387. def load_generic_excel(filepath: str, sheet_name=0, skip_summary_rows=True,
  388. encoding=None, dtype_backend=None, header_row='auto') -> pd.DataFrame:
  389. """
  390. Load any Excel/CSV file into a cleaned DataFrame.
  391. Args:
  392. filepath: Path to the data file (.xlsx, .xls, or .csv)
  393. sheet_name: Sheet name or index (for Excel). Ignored for CSV.
  394. skip_summary_rows: Auto-detect and remove summary/total footer rows
  395. encoding: File encoding (auto-detected for CSV if None)
  396. dtype_backend: Optional pandas dtype backend ('numpy_nullable', 'pyarrow')
  397. header_row: Excel header row index or 'auto'. CSV keeps its native header.
  398. """
  399. fmt = auto_detect_file_format(filepath)
  400. kwargs = {}
  401. if dtype_backend:
  402. kwargs['dtype_backend'] = dtype_backend
  403. if fmt == 'csv':
  404. df = load_generic_csv(filepath, encoding=encoding, **kwargs)
  405. else:
  406. try:
  407. raw_df = pd.read_excel(filepath, sheet_name=sheet_name, header=None, **kwargs)
  408. df = _dataframe_from_detected_header(raw_df, header_row=header_row)
  409. except Exception as e:
  410. if fmt == 'xls':
  411. raise ValueError(
  412. f"Failed to read .xls file. Try converting to .xlsx format. "
  413. f"Error: {e}"
  414. )
  415. raise
  416. return _clean_generic_dataframe(df, skip_summary_rows=skip_summary_rows)
  417. def load_generic_all_sheets(filepath: str, skip_summary_rows=True, header_row='auto') -> pd.DataFrame:
  418. """
  419. Load all sheets from an Excel file, merge into a single DataFrame.
  420. Adds '_source_sheet' column to track the source sheet.
  421. """
  422. fmt = auto_detect_file_format(filepath)
  423. if fmt == 'csv':
  424. return load_generic_excel(filepath, skip_summary_rows=skip_summary_rows)
  425. xl = pd.ExcelFile(filepath)
  426. if len(xl.sheet_names) == 1:
  427. raw_df = pd.read_excel(filepath, sheet_name=xl.sheet_names[0], header=None)
  428. df = _dataframe_from_detected_header(raw_df, header_row=header_row)
  429. return _clean_generic_dataframe(df, skip_summary_rows=skip_summary_rows)
  430. frames = []
  431. for sheet in xl.sheet_names:
  432. try:
  433. raw_df = pd.read_excel(filepath, sheet_name=sheet, header=None)
  434. df = _dataframe_from_detected_header(raw_df, header_row=header_row)
  435. df['_source_sheet'] = sheet
  436. frames.append(df)
  437. except Exception:
  438. continue
  439. if not frames:
  440. raise ValueError(f"No valid sheets found in {filepath}")
  441. combined = pd.concat(frames, ignore_index=True)
  442. return _clean_generic_dataframe(combined, skip_summary_rows=skip_summary_rows)
  443. def auto_detect_date_column(df: pd.DataFrame) -> str:
  444. """Auto-detect the primary date/time column in a DataFrame."""
  445. date_keywords = ['日期', '时间', 'date', 'time', '年', '月', '日']
  446. for col in df.columns:
  447. col_str = str(col).lower().strip()
  448. if any(kw in col_str for kw in date_keywords):
  449. parsed = pd.to_datetime(df[col], errors='coerce')
  450. if parsed.notna().sum() > len(df) * 0.5:
  451. return col
  452. return ''
  453. def auto_parse_single_sheet(filepath: str, sheet_name=0) -> pd.DataFrame:
  454. """Load and clean a single sheet (shortcut for load_generic_excel)."""
  455. return load_generic_excel(filepath, sheet_name=sheet_name)
  456. def load_generic_file_info(filepath: str) -> dict:
  457. """
  458. Return file metadata without full data loading.
  459. Useful for quick inspection before deciding how to load.
  460. """
  461. info = {'filepath': filepath, 'format': auto_detect_file_format(filepath)}
  462. if info['format'] == 'csv':
  463. try:
  464. with open(filepath, 'r', encoding='utf-8-sig') as f:
  465. sample = f.read(8192)
  466. dialect = csv.Sniffer().sniff(sample[:4096])
  467. info['delimiter'] = dialect.delimiter
  468. info['has_header'] = csv.Sniffer().has_header(sample)
  469. info['approx_rows'] = sample.count('\n')
  470. except Exception:
  471. info['delimiter'] = ','
  472. else:
  473. try:
  474. xl = pd.ExcelFile(filepath)
  475. info['sheet_names'] = xl.sheet_names
  476. info['sheet_count'] = len(xl.sheet_names)
  477. except Exception as e:
  478. info['error'] = str(e)
  479. info['file_size_mb'] = round(os.path.getsize(filepath) / (1024 * 1024), 2)
  480. return info
  481. if __name__ == '__main__':
  482. import sys
  483. if len(sys.argv) > 1:
  484. fp = sys.argv[1]
  485. fmt = auto_detect_file_format(fp)
  486. print(f"File: {fp}")
  487. print(f"Format: {fmt}")
  488. file_info = load_generic_file_info(fp)
  489. print(f"Size: {file_info.get('file_size_mb', '?')} MB")
  490. if 'sheet_names' in file_info:
  491. print(f"Sheets ({file_info['sheet_count']}): {file_info['sheet_names'][:5]}...")
  492. df = load_generic_excel(fp)
  493. date_col = auto_detect_date_column(df)
  494. print(f"Generic load: {len(df)} rows x {len(df.columns)} cols, "
  495. f"date column: {date_col}")
  496. print(f"Columns: {list(df.columns)}")