data_loader.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. """
  2. Excel data loader for daily/weekly/monthly report generation.
  3. """
  4. import pandas as pd
  5. from datetime import datetime, timedelta
  6. import re
  7. import warnings
  8. # Field mapping: Excel column name -> internal field name
  9. FIELD_MAP = {
  10. '序号': 'seq',
  11. '目的国家': 'country',
  12. '合同号': 'contract_no',
  13. '用户名称/公司': 'customer',
  14. '意向车型及数量': 'product_info',
  15. '订单总数量': 'order_qty',
  16. '负责人': 'owner',
  17. '当前状态': 'status',
  18. '拟定合同时间': 'contract_date',
  19. '跟单天数': 'tracking_days',
  20. '定金支付时间': 'deposit_date',
  21. '订金认领时间': 'deposit_claim_date',
  22. '订单生成时间': 'order_gen_date',
  23. '价格评审时间': 'price_review_date',
  24. '合同评审时间': 'contract_review_date',
  25. '合同提交盖章申请时间': 'seal_apply_date',
  26. '合同盖章时间': 'seal_date',
  27. '车辆下线入库状态': 'inventory_status',
  28. '尾款支付时间': 'final_pay_date',
  29. '尾款认领时间': 'final_claim_date',
  30. '智慧关务信息维护': 'customs_date',
  31. '许可证办理时间': 'license_date',
  32. '车辆发运时间': 'ship_date',
  33. '预计开票时间': 'invoice_date',
  34. '今日进度更新': 'progress_update',
  35. '是否更新': 'is_updated',
  36. '支持需求': 'support_request',
  37. '4月交付': 'deliver_apr',
  38. '5月预测': 'forecast_may',
  39. }
  40. STATUS_ORDER = ['A', 'B', 'C', 'D', 'E', 'F']
  41. STATUS_LABELS = {
  42. 'A': '合同拟定中',
  43. 'B': '已锁定合同待付订金',
  44. 'C': '已付订金待生产',
  45. 'D': '已生产待付尾款',
  46. 'E': '已付尾款待发运',
  47. 'F': '已发运',
  48. }
  49. def _normalize_status(val):
  50. """Extract status code A-F from status string."""
  51. if pd.isna(val):
  52. return None
  53. s = str(val).strip()
  54. # Match pattern like "A(合同拟定中)" or "A"
  55. m = re.match(r'^([A-F])', s)
  56. if m:
  57. return m.group(1)
  58. return None
  59. def _parse_date(val):
  60. """Parse various date formats."""
  61. if pd.isna(val):
  62. return None
  63. if isinstance(val, datetime):
  64. return val
  65. for fmt in ('%Y-%m-%d', '%Y/%m/%d', '%Y年%m月%d日'):
  66. try:
  67. return datetime.strptime(str(val).strip(), fmt)
  68. except ValueError:
  69. continue
  70. return None
  71. def _sheet_name_for_date(date: datetime) -> str:
  72. """Convert datetime to expected sheet name."""
  73. return date.strftime('%Y年%m月%d日')
  74. def load_workbook_metadata(filepath: str) -> dict:
  75. """Return workbook metadata: sheet names, date range."""
  76. xl = pd.ExcelFile(filepath)
  77. sheets = xl.sheet_names
  78. dates = []
  79. for s in sheets:
  80. try:
  81. d = datetime.strptime(s, '%Y年%m月%d日')
  82. dates.append(d)
  83. except ValueError:
  84. continue
  85. dates.sort()
  86. return {
  87. 'sheets': sheets,
  88. 'date_range': (dates[0], dates[-1]) if dates else (None, None),
  89. 'total_days': len(dates),
  90. }
  91. def load_daily(filepath: str, date: datetime) -> pd.DataFrame:
  92. """Load single-day order data."""
  93. sheet = _sheet_name_for_date(date)
  94. df = pd.read_excel(filepath, sheet_name=sheet)
  95. return _clean_dataframe(df)
  96. def load_date_range(filepath: str, start: datetime, end: datetime) -> pd.DataFrame:
  97. """Load and concatenate data across a date range [start, end]."""
  98. xl = pd.ExcelFile(filepath)
  99. frames = []
  100. current = start
  101. while current <= end:
  102. sheet = _sheet_name_for_date(current)
  103. if sheet in xl.sheet_names:
  104. df = pd.read_excel(filepath, sheet_name=sheet)
  105. df['_data_date'] = current
  106. frames.append(df)
  107. current += timedelta(days=1)
  108. if not frames:
  109. raise ValueError(f"No data found between {start.date()} and {end.date()}")
  110. combined = pd.concat(frames, ignore_index=True)
  111. return _clean_dataframe(combined)
  112. def load_weekly(filepath: str, year: int, week_num: int, week_start_day=0) -> tuple:
  113. """
  114. Load data for a specific week.
  115. Returns (current_week_df, prev_week_df).
  116. week_start_day: 0=Monday, 6=Sunday
  117. """
  118. # Find the first day of the given week
  119. # Simplified: assume data starts from a known reference
  120. meta = load_workbook_metadata(filepath)
  121. first_date, last_date = meta['date_range']
  122. if first_date is None:
  123. raise ValueError("No valid date sheets found")
  124. # Find the Monday of the target week (using ISO week definition)
  125. # Jan 4 is always in week 1
  126. jan4 = datetime(year, 1, 4)
  127. # Adjust to Monday
  128. jan4_monday = jan4 - timedelta(days=jan4.weekday())
  129. target_monday = jan4_monday + timedelta(weeks=week_num - 1)
  130. target_sunday = target_monday + timedelta(days=6)
  131. # Clamp to available data range
  132. start = max(target_monday, first_date)
  133. end = min(target_sunday, last_date)
  134. current = load_date_range(filepath, start, end)
  135. # Previous week
  136. prev_start = start - timedelta(days=7)
  137. prev_end = end - timedelta(days=7)
  138. if prev_start >= first_date:
  139. previous = load_date_range(filepath, prev_start, prev_end)
  140. else:
  141. previous = pd.DataFrame(columns=current.columns)
  142. return current, previous
  143. def load_monthly(filepath: str, year: int, month: int) -> tuple:
  144. """
  145. Load data for a specific month.
  146. Returns (current_month_df, prev_month_df, yoy_month_df).
  147. """
  148. start = datetime(year, month, 1)
  149. # Last day of month
  150. if month == 12:
  151. end = datetime(year + 1, 1, 1) - timedelta(days=1)
  152. else:
  153. end = datetime(year, month + 1, 1) - timedelta(days=1)
  154. current = load_date_range(filepath, start, end)
  155. # Previous month
  156. if month == 1:
  157. prev_start = datetime(year - 1, 12, 1)
  158. prev_end = datetime(year, 1, 1) - timedelta(days=1)
  159. else:
  160. prev_start = datetime(year, month - 1, 1)
  161. prev_end = datetime(year, month, 1) - timedelta(days=1)
  162. try:
  163. previous = load_date_range(filepath, prev_start, prev_end)
  164. except ValueError:
  165. previous = pd.DataFrame(columns=current.columns)
  166. # YoY (same month last year)
  167. yoy_start = datetime(year - 1, month, 1)
  168. if month == 12:
  169. yoy_end = datetime(year, 1, 1) - timedelta(days=1)
  170. else:
  171. yoy_end = datetime(year - 1, month + 1, 1) - timedelta(days=1)
  172. try:
  173. yoy = load_date_range(filepath, yoy_start, yoy_end)
  174. except ValueError:
  175. yoy = pd.DataFrame(columns=current.columns)
  176. return current, previous, yoy
  177. def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
  178. """Rename columns, parse dates, clean statuses."""
  179. # Rename known columns
  180. rename_map = {k: v for k, v in FIELD_MAP.items() if k in df.columns}
  181. df = df.rename(columns=rename_map)
  182. # Normalize status
  183. if 'status' in df.columns:
  184. df['status_code'] = df['status'].apply(_normalize_status)
  185. # Parse numeric fields
  186. if 'order_qty' in df.columns:
  187. df['order_qty'] = pd.to_numeric(df['order_qty'], errors='coerce')
  188. # Parse date fields
  189. date_fields = ['contract_date', 'deposit_date', 'order_gen_date',
  190. 'price_review_date', 'contract_review_date', 'seal_apply_date',
  191. 'seal_date', 'final_pay_date', 'customs_date', 'license_date',
  192. 'ship_date', 'invoice_date']
  193. for field in date_fields:
  194. if field in df.columns:
  195. df[field] = df[field].apply(_parse_date)
  196. # Tracking days
  197. if 'tracking_days' in df.columns:
  198. df['tracking_days'] = pd.to_numeric(df['tracking_days'], errors='coerce')
  199. # Boolean updated
  200. if 'is_updated' in df.columns:
  201. df['is_updated_flag'] = df['is_updated'].astype(str).str.strip() == '是'
  202. return df
  203. if __name__ == '__main__':
  204. import sys
  205. if len(sys.argv) > 1:
  206. fp = sys.argv[1]
  207. meta = load_workbook_metadata(fp)
  208. print(f"Sheets: {meta['sheets'][:5]}...")
  209. print(f"Date range: {meta['date_range'][0]} ~ {meta['date_range'][1]}")
  210. print(f"Total days: {meta['total_days']}")