data_loader.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. """
  2. Excel data loader for daily/weekly/monthly report generation.
  3. Contains both legacy order-specific loaders and enhanced generic loaders.
  4. """
  5. import pandas as pd
  6. from datetime import datetime, timedelta
  7. import re
  8. import warnings
  9. import os
  10. import io
  11. import csv
  12. # =====================================================================
  13. # LEGACY SECTION — Order-specific loaders (kept for backward compat)
  14. # =====================================================================
  15. FIELD_MAP = {
  16. '序号': 'seq',
  17. '目的国家': 'country',
  18. '合同号': 'contract_no',
  19. '用户名称/公司': 'customer',
  20. '意向车型及数量': 'product_info',
  21. '订单总数量': 'order_qty',
  22. '负责人': 'owner',
  23. '当前状态': 'status',
  24. '拟定合同时间': 'contract_date',
  25. '跟单天数': 'tracking_days',
  26. '定金支付时间': 'deposit_date',
  27. '订金认领时间': 'deposit_claim_date',
  28. '订单生成时间': 'order_gen_date',
  29. '价格评审时间': 'price_review_date',
  30. '合同评审时间': 'contract_review_date',
  31. '合同提交盖章申请时间': 'seal_apply_date',
  32. '合同盖章时间': 'seal_date',
  33. '车辆下线入库状态': 'inventory_status',
  34. '尾款支付时间': 'final_pay_date',
  35. '尾款认领时间': 'final_claim_date',
  36. '智慧关务信息维护': 'customs_date',
  37. '许可证办理时间': 'license_date',
  38. '车辆发运时间': 'ship_date',
  39. '预计开票时间': 'invoice_date',
  40. '今日进度更新': 'progress_update',
  41. '是否更新': 'is_updated',
  42. '支持需求': 'support_request',
  43. '4月交付': 'deliver_apr',
  44. '5月预测': 'forecast_may',
  45. }
  46. STATUS_ORDER = ['A', 'B', 'C', 'D', 'E', 'F']
  47. STATUS_LABELS = {
  48. 'A': '合同拟定中',
  49. 'B': '已锁定合同待付订金',
  50. 'C': '已付订金待生产',
  51. 'D': '已生产待付尾款',
  52. 'E': '已付尾款待发运',
  53. 'F': '已发运',
  54. }
  55. def _normalize_status(val):
  56. """Extract status code A-F from status string."""
  57. if pd.isna(val):
  58. return None
  59. s = str(val).strip()
  60. m = re.match(r'^([A-F])', s)
  61. if m:
  62. return m.group(1)
  63. return None
  64. def _parse_date(val):
  65. """Parse various date formats."""
  66. if pd.isna(val):
  67. return None
  68. if isinstance(val, datetime):
  69. return val
  70. for fmt in ('%Y-%m-%d', '%Y/%m/%d', '%Y年%m月%d日'):
  71. try:
  72. return datetime.strptime(str(val).strip(), fmt)
  73. except ValueError:
  74. continue
  75. return None
  76. def _sheet_name_for_date(date: datetime) -> str:
  77. """Convert datetime to expected sheet name."""
  78. return date.strftime('%Y年%m月%d日')
  79. def load_workbook_metadata(filepath: str) -> dict:
  80. """Return workbook metadata: sheet names, date range."""
  81. xl = pd.ExcelFile(filepath)
  82. sheets = xl.sheet_names
  83. dates = []
  84. for s in sheets:
  85. try:
  86. d = datetime.strptime(s, '%Y年%m月%d日')
  87. dates.append(d)
  88. except ValueError:
  89. continue
  90. dates.sort()
  91. return {
  92. 'sheets': sheets,
  93. 'date_range': (dates[0], dates[-1]) if dates else (None, None),
  94. 'total_days': len(dates),
  95. }
  96. def load_daily(filepath: str, date: datetime) -> pd.DataFrame:
  97. """Load single-day order data."""
  98. sheet = _sheet_name_for_date(date)
  99. df = pd.read_excel(filepath, sheet_name=sheet)
  100. return _clean_dataframe(df)
  101. def load_date_range(filepath: str, start: datetime, end: datetime) -> pd.DataFrame:
  102. """Load and concatenate data across a date range [start, end]."""
  103. xl = pd.ExcelFile(filepath)
  104. frames = []
  105. current = start
  106. while current <= end:
  107. sheet = _sheet_name_for_date(current)
  108. if sheet in xl.sheet_names:
  109. df = pd.read_excel(filepath, sheet_name=sheet)
  110. df['_data_date'] = current
  111. frames.append(df)
  112. current += timedelta(days=1)
  113. if not frames:
  114. raise ValueError(f"No data found between {start.date()} and {end.date()}")
  115. combined = pd.concat(frames, ignore_index=True)
  116. return _clean_dataframe(combined)
  117. def load_weekly(filepath: str, year: int, week_num: int, week_start_day=0) -> tuple:
  118. """
  119. Load data for a specific week.
  120. Returns (current_week_df, prev_week_df).
  121. week_start_day: 0=Monday, 6=Sunday
  122. """
  123. meta = load_workbook_metadata(filepath)
  124. first_date, last_date = meta['date_range']
  125. if first_date is None:
  126. raise ValueError("No valid date sheets found")
  127. jan4 = datetime(year, 1, 4)
  128. jan4_monday = jan4 - timedelta(days=jan4.weekday())
  129. target_monday = jan4_monday + timedelta(weeks=week_num - 1)
  130. target_sunday = target_monday + timedelta(days=6)
  131. start = max(target_monday, first_date)
  132. end = min(target_sunday, last_date)
  133. current = load_date_range(filepath, start, end)
  134. prev_start = start - timedelta(days=7)
  135. prev_end = end - timedelta(days=7)
  136. if prev_start >= first_date:
  137. previous = load_date_range(filepath, prev_start, prev_end)
  138. else:
  139. previous = pd.DataFrame(columns=current.columns)
  140. return current, previous
  141. def load_monthly(filepath: str, year: int, month: int) -> tuple:
  142. """
  143. Load data for a specific month.
  144. Returns (current_month_df, prev_month_df, yoy_month_df).
  145. """
  146. start = datetime(year, month, 1)
  147. if month == 12:
  148. end = datetime(year + 1, 1, 1) - timedelta(days=1)
  149. else:
  150. end = datetime(year, month + 1, 1) - timedelta(days=1)
  151. current = load_date_range(filepath, start, end)
  152. if month == 1:
  153. prev_start = datetime(year - 1, 12, 1)
  154. prev_end = datetime(year, 1, 1) - timedelta(days=1)
  155. else:
  156. prev_start = datetime(year, month - 1, 1)
  157. prev_end = datetime(year, month, 1) - timedelta(days=1)
  158. try:
  159. previous = load_date_range(filepath, prev_start, prev_end)
  160. except ValueError:
  161. previous = pd.DataFrame(columns=current.columns)
  162. yoy_start = datetime(year - 1, month, 1)
  163. if month == 12:
  164. yoy_end = datetime(year, 1, 1) - timedelta(days=1)
  165. else:
  166. yoy_end = datetime(year - 1, month + 1, 1) - timedelta(days=1)
  167. try:
  168. yoy = load_date_range(filepath, yoy_start, yoy_end)
  169. except ValueError:
  170. yoy = pd.DataFrame(columns=current.columns)
  171. return current, previous, yoy
  172. def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
  173. """Rename columns, parse dates, clean statuses (legacy)."""
  174. rename_map = {k: v for k, v in FIELD_MAP.items() if k in df.columns}
  175. df = df.rename(columns=rename_map)
  176. if 'status' in df.columns:
  177. df['status_code'] = df['status'].apply(_normalize_status)
  178. if 'order_qty' in df.columns:
  179. df['order_qty'] = pd.to_numeric(df['order_qty'], errors='coerce')
  180. date_fields = ['contract_date', 'deposit_date', 'order_gen_date',
  181. 'price_review_date', 'contract_review_date', 'seal_apply_date',
  182. 'seal_date', 'final_pay_date', 'customs_date', 'license_date',
  183. 'ship_date', 'invoice_date']
  184. for field in date_fields:
  185. if field in df.columns:
  186. df[field] = df[field].apply(_parse_date)
  187. if 'tracking_days' in df.columns:
  188. df['tracking_days'] = pd.to_numeric(df['tracking_days'], errors='coerce')
  189. if 'is_updated' in df.columns:
  190. df['is_updated_flag'] = df['is_updated'].astype(str).str.strip() == '是'
  191. return df
  192. # =====================================================================
  193. # GENERIC LOADING SECTION — Universal loaders for any Excel data
  194. # =====================================================================
  195. # Summary row keywords (Chinese and English) to auto-detect and skip
  196. SUMMARY_KEYWORDS = [
  197. '合计', '总计', '小计', '汇总', '累计', '总和',
  198. 'total', 'sum', 'subtotal', 'grand total', '合计:', '总计:',
  199. '平均', 'avg', 'average',
  200. ]
  201. # Encoding priority list for CSV detection
  202. CSV_ENCODINGS = ['utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'gb18030', 'latin-1', 'cp1252']
  203. def auto_detect_file_format(filepath: str) -> str:
  204. """Auto-detect file format: xlsx, xls, csv, or unknown."""
  205. ext = os.path.splitext(filepath)[1].lower()
  206. if ext in ('.xlsx', '.xls'):
  207. return ext[1:]
  208. if ext == '.csv':
  209. return 'csv'
  210. if ext in ('.xlsm', '.xlsb'):
  211. return ext[1:]
  212. return 'unknown'
  213. def load_generic_csv(filepath: str, encoding=None, **kwargs) -> pd.DataFrame:
  214. """
  215. Load a CSV file with auto-encoding detection.
  216. Tries common encodings until one succeeds.
  217. """
  218. if encoding:
  219. try:
  220. return pd.read_csv(filepath, encoding=encoding, **kwargs)
  221. except (UnicodeDecodeError, UnicodeError):
  222. raise ValueError(f"Failed to decode {filepath} with encoding {encoding}")
  223. last_error = None
  224. for enc in CSV_ENCODINGS:
  225. try:
  226. df = pd.read_csv(filepath, encoding=enc, **kwargs)
  227. if len(df.columns) > 0:
  228. return df
  229. except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError) as e:
  230. last_error = e
  231. continue
  232. raise ValueError(
  233. f"Unable to decode CSV file {filepath}. Tried encodings: "
  234. f"{CSV_ENCODINGS}. Last error: {last_error}"
  235. )
  236. def _detect_and_skip_footer_rows(df_raw: pd.DataFrame) -> pd.DataFrame:
  237. """Detect and remove summary/aggregation rows at the end of the data."""
  238. if df_raw.empty:
  239. return df_raw
  240. rows_to_drop = []
  241. text_cols = [c for c in df_raw.columns if df_raw[c].dtype == 'object']
  242. for idx in range(len(df_raw) - 1, -1, -1):
  243. row = df_raw.iloc[idx]
  244. is_summary = False
  245. for col in text_cols:
  246. val = str(row.get(col, '')).strip().lower()
  247. if any(kw in val for kw in SUMMARY_KEYWORDS):
  248. is_summary = True
  249. break
  250. if is_summary:
  251. rows_to_drop.append(idx)
  252. else:
  253. break
  254. if len(rows_to_drop) > 20:
  255. break
  256. if rows_to_drop:
  257. df_raw = df_raw.drop(index=rows_to_drop)
  258. df_raw = df_raw.reset_index(drop=True)
  259. return df_raw
  260. def _detect_empty_or_notes_rows(df_raw: pd.DataFrame) -> pd.DataFrame:
  261. """Remove leading empty rows and trailing fully-empty rows."""
  262. if df_raw.empty:
  263. return df_raw
  264. non_empty_rows = df_raw.notna().any(axis=1)
  265. first_valid = non_empty_rows.idxmax() if non_empty_rows.any() else 0
  266. last_valid = non_empty_rows[non_empty_rows].index[-1] if non_empty_rows.any() else len(df_raw)
  267. df_raw = df_raw.iloc[first_valid:last_valid + 1].reset_index(drop=True)
  268. return df_raw
  269. def normalize_column_names(col_name: str) -> str:
  270. """
  271. Normalize a single column name: strip whitespace, unify brackets, remove special chars.
  272. """
  273. if not isinstance(col_name, str):
  274. return col_name
  275. name = col_name.strip()
  276. name = name.replace('(', '(').replace(')', ')')
  277. name = name.replace('【', '[').replace('】', ']')
  278. name = name.replace('\n', ' ').replace('\r', ' ')
  279. name = re.sub(r'\s+', ' ', name)
  280. return name
  281. def _clean_generic_dataframe(df: pd.DataFrame, skip_summary_rows=True) -> pd.DataFrame:
  282. """
  283. Universal DataFrame cleaning:
  284. - Remove fully empty rows/columns
  285. - Drop 'Unnamed' columns
  286. - Normalize column names (strip whitespace, unify brackets)
  287. - Auto-detect and remove summary/total rows
  288. - Try to parse date columns
  289. - Try to parse numeric columns
  290. """
  291. if df.empty:
  292. return df
  293. df = df.dropna(how='all').reset_index(drop=True)
  294. df = df.dropna(axis=1, how='all')
  295. df = df.loc[:, ~df.columns.astype(str).str.contains('^Unnamed', na=False)]
  296. df = df.rename(columns=normalize_column_names)
  297. if skip_summary_rows:
  298. df = _detect_and_skip_footer_rows(df)
  299. df = _detect_empty_or_notes_rows(df)
  300. for col in df.columns:
  301. if df[col].dtype == 'object':
  302. try:
  303. parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
  304. if parsed.notna().sum() > len(df) * 0.7:
  305. df[col] = parsed
  306. continue
  307. except Exception:
  308. pass
  309. try:
  310. numeric = pd.to_numeric(df[col], errors='coerce')
  311. if numeric.notna().sum() > len(df) * 0.7:
  312. df[col] = numeric
  313. except Exception:
  314. pass
  315. return df
  316. def load_generic_excel(filepath: str, sheet_name=0, skip_summary_rows=True,
  317. encoding=None, dtype_backend=None) -> pd.DataFrame:
  318. """
  319. Load any Excel/CSV file into a cleaned DataFrame.
  320. Args:
  321. filepath: Path to the data file (.xlsx, .xls, or .csv)
  322. sheet_name: Sheet name or index (for Excel). Ignored for CSV.
  323. skip_summary_rows: Auto-detect and remove summary/total footer rows
  324. encoding: File encoding (auto-detected for CSV if None)
  325. dtype_backend: Optional pandas dtype backend ('numpy_nullable', 'pyarrow')
  326. """
  327. fmt = auto_detect_file_format(filepath)
  328. kwargs = {}
  329. if dtype_backend:
  330. kwargs['dtype_backend'] = dtype_backend
  331. if fmt == 'csv':
  332. df = load_generic_csv(filepath, encoding=encoding, **kwargs)
  333. else:
  334. try:
  335. df = pd.read_excel(filepath, sheet_name=sheet_name, **kwargs)
  336. except Exception as e:
  337. if fmt == 'xls':
  338. raise ValueError(
  339. f"Failed to read .xls file. Try converting to .xlsx format. "
  340. f"Error: {e}"
  341. )
  342. raise
  343. return _clean_generic_dataframe(df, skip_summary_rows=skip_summary_rows)
  344. def load_generic_all_sheets(filepath: str, skip_summary_rows=True) -> pd.DataFrame:
  345. """
  346. Load all sheets from an Excel file, merge into a single DataFrame.
  347. Adds '_source_sheet' column to track the source sheet.
  348. """
  349. fmt = auto_detect_file_format(filepath)
  350. if fmt == 'csv':
  351. return load_generic_excel(filepath, skip_summary_rows=skip_summary_rows)
  352. xl = pd.ExcelFile(filepath)
  353. if len(xl.sheet_names) == 1:
  354. df = pd.read_excel(filepath, sheet_name=xl.sheet_names[0])
  355. return _clean_generic_dataframe(df, skip_summary_rows=skip_summary_rows)
  356. frames = []
  357. for sheet in xl.sheet_names:
  358. try:
  359. df = pd.read_excel(filepath, sheet_name=sheet)
  360. df['_source_sheet'] = sheet
  361. frames.append(df)
  362. except Exception:
  363. continue
  364. if not frames:
  365. raise ValueError(f"No valid sheets found in {filepath}")
  366. combined = pd.concat(frames, ignore_index=True)
  367. return _clean_generic_dataframe(combined, skip_summary_rows=skip_summary_rows)
  368. def auto_detect_date_column(df: pd.DataFrame) -> str:
  369. """Auto-detect the primary date/time column in a DataFrame."""
  370. date_keywords = ['日期', '时间', 'date', 'time', '年', '月', '日']
  371. for col in df.columns:
  372. col_str = str(col).lower().strip()
  373. if any(kw in col_str for kw in date_keywords):
  374. parsed = pd.to_datetime(df[col], errors='coerce')
  375. if parsed.notna().sum() > len(df) * 0.5:
  376. return col
  377. return ''
  378. def auto_parse_single_sheet(filepath: str, sheet_name=0) -> pd.DataFrame:
  379. """Load and clean a single sheet (shortcut for load_generic_excel)."""
  380. return load_generic_excel(filepath, sheet_name=sheet_name)
  381. def load_generic_file_info(filepath: str) -> dict:
  382. """
  383. Return file metadata without full data loading.
  384. Useful for quick inspection before deciding how to load.
  385. """
  386. info = {'filepath': filepath, 'format': auto_detect_file_format(filepath)}
  387. if info['format'] == 'csv':
  388. try:
  389. with open(filepath, 'r', encoding='utf-8-sig') as f:
  390. sample = f.read(8192)
  391. dialect = csv.Sniffer().sniff(sample[:4096])
  392. info['delimiter'] = dialect.delimiter
  393. info['has_header'] = csv.Sniffer().has_header(sample)
  394. info['approx_rows'] = sample.count('\n')
  395. except Exception:
  396. info['delimiter'] = ','
  397. else:
  398. try:
  399. xl = pd.ExcelFile(filepath)
  400. info['sheet_names'] = xl.sheet_names
  401. info['sheet_count'] = len(xl.sheet_names)
  402. except Exception as e:
  403. info['error'] = str(e)
  404. info['file_size_mb'] = round(os.path.getsize(filepath) / (1024 * 1024), 2)
  405. return info
  406. if __name__ == '__main__':
  407. import sys
  408. if len(sys.argv) > 1:
  409. fp = sys.argv[1]
  410. fmt = auto_detect_file_format(fp)
  411. print(f"File: {fp}")
  412. print(f"Format: {fmt}")
  413. file_info = load_generic_file_info(fp)
  414. print(f"Size: {file_info.get('file_size_mb', '?')} MB")
  415. if 'sheet_names' in file_info:
  416. print(f"Sheets ({file_info['sheet_count']}): {file_info['sheet_names'][:5]}...")
  417. df = load_generic_excel(fp)
  418. date_col = auto_detect_date_column(df)
  419. print(f"Generic load: {len(df)} rows x {len(df.columns)} cols, "
  420. f"date column: {date_col}")
  421. print(f"Columns: {list(df.columns)}")