| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 订单信息解析脚本:支持编号列表格式和自然语言格式
- """
- import re
def parse_structured_order(text: str) -> dict:
    """Parse order information written as a numbered, labelled list.

    Header fields are located by their Chinese labels anywhere in ``text``.
    Vehicle entries are read line by line: a header line shaped like
    ``(N) <name> 型号:<model>`` may be followed by a detail line shaped like
    ``<emission> 代码:<engine> <qty>台<color>``.

    Colons, semicolons and parentheses are accepted in both their ASCII and
    fullwidth forms.

    Args:
        text: Raw order text.

    Returns:
        A dict with the string keys ``sales_name``, ``sales_title``,
        ``buyer_cn``, ``buyer_en``, ``address_cn``, ``address_en``, ``tel``,
        ``departure_port``, ``destination_country`` and ``destination_port``
        (each ``''`` when the label is not found), ``trade_term``
        (upper-cased; ``'FCA'`` when not found) and ``vehicles`` (a list of
        per-vehicle dicts, empty when no vehicle header line is present).
    """
    colon = r'[:\uff1a]'
    # A value that may share its line with another field stops at a semicolon.
    # Every value also stops at a newline or at the end of the text, so the
    # last field is still captured when nothing follows it.
    short_value = colon + r'\s*(.+?)(?:[;\uff1b\n]|$)'
    # These values run to the end of their line (semicolons are kept).
    line_value = colon + r'\s*(.+?)(?:\n|$)'

    # Order matters only for the key order of the returned dict.
    field_patterns = (
        ('sales_name', '姓名' + short_value),
        ('sales_title', '任职' + short_value),
        ('buyer_cn', '合同签约公司中文名称' + short_value),
        ('buyer_en', '合同签约公司英文名称' + line_value),
        ('address_cn', '合同签约公司中文地址' + short_value),
        ('address_en', '合同签约公司英文地址' + line_value),
        ('tel', '合同联系电话' + colon + r'\s*([\d+\-]+)'),
        ('departure_port', '国内出发港口' + short_value),
        ('destination_country', '目的地国家' + line_value),
        ('destination_port', '目的地港口' + line_value),
    )

    result = {}
    for key, pattern in field_patterns:
        found = re.search(pattern, text)
        result[key] = found.group(1).strip() if found else ''

    term = re.search(
        '结算方式' + colon + r'\s*(FCA|FOB|EXW|CIF)', text, re.IGNORECASE
    )
    result['trade_term'] = term.group(1).upper() if term else 'FCA'

    # Vehicle header line: "(N) <name> 型号:<model code>".
    header_re = re.compile(
        r'[(\uff08](\d+)[)\uff09]\s*(.+?)\s*型号' + colon + r'\s*(\w+)'
    )
    # Vehicle detail line: "<emission> 代码:<engine code> <qty>台<color>".
    detail_re = re.compile(
        r'(.+?)\s+代码' + colon + r'\s*(\w+)\s+(\d+)台(\w+)'
    )

    vehicles = []
    lines = [raw.strip() for raw in text.split('\n')]
    index = 0
    while index < len(lines):
        header = header_re.match(lines[index])
        index += 1  # from here on, ``index`` points at the following line
        if not header:
            continue

        name_cn, name_en = split_name_cn_en(header.group(2).strip())
        vehicle = {
            'seq': header.group(1),
            'name_cn': name_cn,
            'name_en': name_en,
            'model_code': header.group(3).strip(),
            'emission': '',
            'engine_code': '',
            'quantity': 1,
            'color': '',
            'unit_price_usd': None,
            'config_desc': '',
        }

        # The detail line is optional; it is consumed only when it matches.
        if index < len(lines):
            detail = detail_re.search(lines[index])
            if detail:
                vehicle['emission'] = detail.group(1).strip()
                vehicle['engine_code'] = detail.group(2).strip()
                vehicle['quantity'] = int(detail.group(3))
                vehicle['color'] = detail.group(4).strip()
                index += 1

        vehicles.append(vehicle)

    result['vehicles'] = vehicles
    return result
def split_name_cn_en(name_part: str) -> tuple:
    """Split a vehicle description into a Chinese name and an English name.

    Trailing separators are removed first: ASCII or fullwidth commas, the
    ideographic comma, hyphens, and any whitespace mixed in with them.  The
    remainder is split only when it starts with a Chinese character and ends
    with a run made up solely of ASCII letters, digits, spaces and
    parentheses; otherwise the whole string is returned as one name.

    Examples:
        '荣光新双排货车 N350 Double Cab Pickup,-'
            -> ('荣光新双排货车', 'N350 Double Cab Pickup')
        '五菱荣光新卡双后轮 2.0L 5MT 单排'
            -> ('五菱荣光新卡双后轮 2.0L 5MT 单排', '')

    Args:
        name_part: The raw description text.

    Returns:
        A ``(name_cn, name_en)`` tuple of strings; the side that does not
        apply is ``''``.
    """
    # Whitespace is part of the class so that "..., -" is stripped completely
    # instead of leaving a comma that would defeat the split below.
    cleaned = re.sub(r'[,\uff0c、\-\s]+$', '', name_part).strip()

    # Chinese part (may contain spaces, digits and dots), whitespace, then an
    # English tail that must reach the end of the string.
    split = re.match(
        r'([\u4e00-\u9fff][\u4e00-\u9fff\s\d.]+?)\s+([A-Za-z0-9][A-Za-z0-9\s\(\)]+)$',
        cleaned,
    )
    if split:
        return split.group(1).strip(), split.group(2).strip()

    # No clean English tail: anything containing Chinese counts as the
    # Chinese name, everything else as the English name.
    if re.search(r'[\u4e00-\u9fff]', cleaned):
        return cleaned, ''
    return '', cleaned
def parse_natural_language(text: str) -> dict:
    """Extract order fields from free-form text with regular expressions.

    Labels may be followed by ``是``, a colon (ASCII or fullwidth) or a
    literal ``|``.  Free-text values end at the first comma, semicolon
    (ASCII or fullwidth), Chinese full stop or newline.

    Args:
        text: Raw order text.

    Returns:
        A dict that always contains ``trade_term`` (upper-cased; ``'FCA'``
        when not found), ``departure_port`` and ``destination_country``
        (``''`` when not found) and ``vehicles`` (a list of per-vehicle
        dicts).  ``buyer_cn``, ``buyer_en``, ``address_cn``, ``address_en``
        and ``tel`` are present only when a matching phrase was found.
    """
    colon = r'[:\uff1a]'
    # '|' is kept in the class for compatibility: a literal pipe between a
    # label and its value has always been accepted.
    sep = r'[是|:\uff1a]'
    value = r'\s*([^,\uff0c;\uff1b。\n]+)'

    result = {}

    # Buyer company name; classified as Chinese or English by its script.
    buyer = re.search(r'(?:买方|买方公司|签约公司|客户)' + sep + value, text)
    if buyer:
        _store_by_script(result, buyer.group(1).strip(), 'buyer_cn', 'buyer_en')

    # Explicit "English name" / "Chinese name" phrases fill whichever side
    # is still empty.
    if not result.get('buyer_en'):
        english = re.search('英文名称' + sep + value, text)
        if english:
            result['buyer_en'] = english.group(1).strip()

    if not result.get('buyer_cn'):
        chinese = re.search('中文名称' + sep + value, text)
        if chinese:
            result['buyer_cn'] = chinese.group(1).strip()

    # Address
    address = re.search(r'(?:地址|ADD)' + sep + value, text)
    if address:
        _store_by_script(
            result, address.group(1).strip(), 'address_cn', 'address_en'
        )

    # Telephone
    phone = re.search(r'(?:电话|Tel|联系电话)' + sep + r'\s*([\d+\-]+)', text)
    if phone:
        result['tel'] = phone.group(1).strip()

    # Trade term (the separator is optional here)
    term = re.search(
        r'(?:贸易条款|结算方式|Incoterm)' + sep + r'?\s*(FCA|FOB|EXW|CIF)',
        text,
        re.IGNORECASE,
    )
    result['trade_term'] = term.group(1).upper() if term else 'FCA'

    # Departure port
    port = re.search(r'(?:出发港|出发港口|起运港)' + sep + value, text)
    result['departure_port'] = port.group(1).strip() if port else ''

    # Destination country (the separator is optional here)
    country = re.search(
        r'(?:目的地国家|目的国|出口到|发往)' + sep + '?' + value, text
    )
    result['destination_country'] = country.group(1).strip() if country else ''

    # Vehicles, pattern 1: <name> [型号:]LZW... [代码:]<engine> ... <qty>台
    # NOTE(review): with DOTALL the lazy name group begins wherever the scan
    # resumes, so a vehicle's name can include text that precedes it.
    named_vehicle = (
        r'(?:车型\d*[:\uff1a、.\uff0e]?\s*)?(.+?)\s+'
        r'(?:型号' + colon + r'\s*)?(LZW\w+)'
        r'.*?(?:代码' + colon + r'\s*)?(\w+).*?(\d+)台'
    )
    vehicles = []
    for name_part, model, engine, qty in re.findall(
        named_vehicle, text, re.DOTALL
    ):
        name_cn, name_en = split_name_cn_en(name_part.strip())
        vehicles.append(_natural_vehicle(name_cn, name_en, model, engine, qty))

    # Pattern 2 (only when pattern 1 found nothing): a model code with no name.
    if not vehicles:
        bare_vehicle = (
            r'(LZW\w+)\s+(?:发动机代码' + colon + r'\s*)?(\w+).*?(\d+)台'
        )
        for model, engine, qty in re.findall(bare_vehicle, text):
            vehicles.append(_natural_vehicle('', '', model, engine, qty))

    result['vehicles'] = vehicles
    return result


def _store_by_script(result, value, cn_key, en_key):
    """Store ``value`` under ``cn_key`` if it contains Chinese, else ``en_key``.

    The other key is set to ``''``; the matched key is inserted first.
    """
    if re.search(r'[\u4e00-\u9fff]', value):
        result[cn_key] = value
        result[en_key] = ''
    else:
        result[en_key] = value
        result[cn_key] = ''


def _natural_vehicle(name_cn, name_en, model_code, engine_code, quantity):
    """Build one vehicle dict for ``parse_natural_language``."""
    return {
        'name_cn': name_cn,
        'name_en': name_en,
        'model_code': model_code.strip(),
        'emission': '',
        'engine_code': engine_code.strip(),
        'quantity': int(quantity),
        'color': '',
        'unit_price_usd': None,
        'config_desc': '',
    }
def parse_order_info(text: str) -> dict:
    """Parse order text, choosing the parser that fits its format.

    The text is treated as a structured numbered list when some line starts
    with digits, a full stop (ASCII or fullwidth) and one of the known
    labels; otherwise it is handled as natural language.

    Args:
        text: Raw order text; surrounding whitespace is ignored.

    Returns:
        The dict produced by the selected parser, with ``trade_term``
        defaulting to ``'FCA'`` and ``departure_port`` defaulting to
        ``'Guangzhou nansha Port'`` when missing or empty.
    """
    text = text.strip()

    numbered_label = re.search(
        r'^\d+[.\uff0e]\s*(姓名|任职|意向车型|合同签约公司)',
        text,
        re.MULTILINE,
    )
    parser = parse_structured_order if numbered_label else parse_natural_language
    result = parser(text)

    # Fill defaults; an empty string counts as missing.
    if not result.get('trade_term'):
        result['trade_term'] = 'FCA'
    if not result.get('departure_port'):
        result['departure_port'] = 'Guangzhou nansha Port'

    return result
if __name__ == '__main__':
    import json
    import sys

    # Emit UTF-8 regardless of the console's default encoding; the JSON below
    # is written with ensure_ascii=False and so contains non-ASCII text.
    sys.stdout.reconfigure(encoding='utf-8')

    # An optional first command-line argument overrides the sample file.
    # The default path is resolved against the current working directory.
    sample_path = sys.argv[1] if len(sys.argv) > 1 else '../assets/订单合同信息案例.txt'
    with open(sample_path, encoding='utf-8') as f:
        test_text = f.read()

    result = parse_order_info(test_text)
    print(json.dumps(result, ensure_ascii=False, indent=2))
|