# -*- coding: utf-8 -*-
"""
Express waybill parsing module.

Merges scattered OCR text blocks into lines and parses the merged text
into structured waybill information.
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any

from .engine import TextBlock


@dataclass
class ExpressInfo:
    """
    Structured information extracted from an express waybill.

    Attributes:
        tracking_number: Waybill / tracking number.
        sender_name: Sender name.
        sender_phone: Sender phone number.
        sender_address: Sender address.
        receiver_name: Receiver name.
        receiver_phone: Receiver phone number.
        receiver_address: Receiver address.
        courier_company: Courier company name.
        raw_text: Merged source text (kept for debugging).
        confidence: Mean confidence of the source text blocks.
        extra_fields: Any other recognised fields.
    """

    tracking_number: Optional[str] = None
    sender_name: Optional[str] = None
    sender_phone: Optional[str] = None
    sender_address: Optional[str] = None
    receiver_name: Optional[str] = None
    receiver_phone: Optional[str] = None
    receiver_address: Optional[str] = None
    courier_company: Optional[str] = None
    raw_text: str = ""
    confidence: float = 0.0
    extra_fields: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a nested dictionary."""
        return {
            "tracking_number": self.tracking_number,
            "sender": {
                "name": self.sender_name,
                "phone": self.sender_phone,
                "address": self.sender_address,
            },
            "receiver": {
                "name": self.receiver_name,
                "phone": self.receiver_phone,
                "address": self.receiver_address,
            },
            "courier_company": self.courier_company,
            "confidence": self.confidence,
            "extra_fields": self.extra_fields,
            "raw_text": self.raw_text,
        }

    @property
    def is_valid(self) -> bool:
        """Return True when the record carries usable waybill data."""
        # At least a tracking number or some receiver information is needed.
        return bool(
            self.tracking_number or self.receiver_name or self.receiver_phone
        )


@dataclass
class TextLine:
    """
    A line of text produced by merging text blocks.

    Attributes:
        text: Merged text of the line.
        blocks: The text blocks the line was built from.
        y_center: Y coordinate of the line centre.
        x_min: X coordinate where the line starts.
    """

    text: str
    blocks: List[TextBlock]
    y_center: float
    x_min: float

    @property
    def confidence(self) -> float:
        """Return the mean confidence of the blocks (0.0 when empty)."""
        if not self.blocks:
            return 0.0
        return sum(b.confidence for b in self.blocks) / len(self.blocks)


class ExpressParser:
    """
    Express waybill parser.

    Merges scattered text blocks into lines and extracts structured
    information from the merged text.
    """

    # Courier keyword -> canonical company name. Lookup iterates in this
    # order and returns the first keyword found in the text.
    COURIER_KEYWORDS = {
        "顺丰": "顺丰速运",
        "SF": "顺丰速运",
        "圆通": "圆通速递",
        "中通": "中通快递",
        "韵达": "韵达快递",
        "申通": "申通快递",
        "极兔": "极兔速递",
        "京东": "京东物流",
        "JD": "京东物流",
        "邮政": "中国邮政",
        "EMS": "中国邮政EMS",
        "百世": "百世快递",
        "德邦": "德邦快递",
        "天天": "天天快递",
        "宅急送": "宅急送",
    }

    # Field name -> regular expressions tried in order; group 1 is the value.
    # Keys must be ExpressInfo attribute names (they are applied via setattr).
    #
    # A label may be followed by an ASCII colon or a full-width colon
    # (U+FF1A). The previous character class listed ":" twice; the
    # full-width form is now written as an escape so it stays unambiguous.
    FIELD_PATTERNS = {
        "tracking_number": [
            r"运单号[:\uff1a]\s*(\w+)",
            r"单号[:\uff1a]\s*(\w+)",
            r"快递单号[:\uff1a]\s*(\w+)",
            r"物流单号[:\uff1a]\s*(\w+)",
            r"^(\d{10,20})$",  # digits-only tracking number on its own line
            r"^([A-Z]{2}\d{9,13}[A-Z]{2})$",  # international format
        ],
        "receiver_name": [
            r"收件人[:\uff1a]\s*(.+?)(?:\s|电话|手机|地址|$)",
            r"收货人[:\uff1a]\s*(.+?)(?:\s|电话|手机|地址|$)",
            r"收[:\uff1a]\s*(.+?)(?:\s|电话|手机|地址|$)",
        ],
        "receiver_phone": [
            r"收件人.*?电话[:\uff1a]\s*(\d{11})",
            r"收件人.*?手机[:\uff1a]\s*(\d{11})",
            r"收.*?(\d{11})",
            r"电话[:\uff1a]\s*(\d{11})",
            r"手机[:\uff1a]\s*(\d{11})",
            # NOTE(review): the source text was cut off right after "(?" in
            # this pattern; a stand-alone 11-digit number is the presumed
            # intent - confirm against version control.
            r"(?<!\d)(\d{11})(?!\d)",
        ],
        # NOTE(review): everything between the pattern above and the return
        # annotation of parse() was missing from the source. Any further
        # entries (e.g. sender_name / sender_phone / address patterns) must
        # be restored from version control.
    }

    def __init__(
        self,
        line_merge_threshold: float = 0.5,
        horizontal_gap_threshold: float = 1.5,
    ) -> None:
        """
        Create a parser.

        NOTE(review): the original constructor was lost with the missing
        span. The two attributes are the ones the merge code reads; the
        parameter names and default values here are placeholders and must
        be confirmed.

        Args:
            line_merge_threshold: Fraction of a block's height within which
                two blocks are treated as lying on the same line.
            horizontal_gap_threshold: Multiple of the previous block's
                average character width above which a space is inserted
                between neighbouring blocks.
        """
        self._line_merge_threshold = line_merge_threshold
        self._horizontal_gap_threshold = horizontal_gap_threshold

    def parse(self, text_blocks: List[TextBlock]) -> ExpressInfo:
        """
        Parse a list of text blocks into waybill information.

        Args:
            text_blocks: Text blocks recognised by OCR.

        Returns:
            The structured waybill information; an empty ExpressInfo when
            no blocks are given.
        """
        if not text_blocks:
            return ExpressInfo()

        # 1. Merge the blocks into lines.
        lines = self._merge_blocks_to_lines(text_blocks)

        # 2. Build the full text used for regex matching.
        full_text = self._lines_to_text(lines)

        # 3. Extract the structured fields.
        info = self._extract_info(full_text, lines)

        # 4. Record the mean confidence and the raw text.
        info.confidence = (
            sum(b.confidence for b in text_blocks) / len(text_blocks)
        )
        info.raw_text = full_text

        return info

    def _merge_blocks_to_lines(
        self, blocks: List[TextBlock]
    ) -> List[TextLine]:
        """
        Group text blocks into lines by vertical position.

        Blocks are sorted by centre Y; a block joins the current line when
        its centre Y is within ``height * line_merge_threshold`` of the
        line's first block, otherwise it starts a new line.

        Args:
            blocks: Text blocks to group.

        Returns:
            The lines from top to bottom; an empty list for empty input.
        """
        if not blocks:
            return []

        sorted_blocks = sorted(blocks, key=lambda b: b.center[1])

        lines: List[TextLine] = []
        current_line_blocks: List[TextBlock] = [sorted_blocks[0]]
        # Reference Y of the current line: the centre of its first block.
        current_y = sorted_blocks[0].center[1]

        for block in sorted_blocks[1:]:
            block_y = block.center[1]
            # The tolerance scales with the height of the candidate block.
            threshold = block.height * self._line_merge_threshold

            if abs(block_y - current_y) <= threshold:
                current_line_blocks.append(block)
            else:
                # Close the current line and start a new one.
                lines.append(self._create_line(current_line_blocks))
                current_line_blocks = [block]
                current_y = block_y

        # Flush the last line.
        if current_line_blocks:
            lines.append(self._create_line(current_line_blocks))

        return lines

    def _create_line(self, blocks: List[TextBlock]) -> TextLine:
        """
        Build a TextLine from the blocks of one line.

        Blocks are ordered by centre X and concatenated; a single space is
        inserted where the horizontal gap to the previous block exceeds
        ``horizontal_gap_threshold`` average character widths.

        Args:
            blocks: Blocks belonging to the line (must not be empty).

        Returns:
            The merged line.
        """
        sorted_blocks = sorted(blocks, key=lambda b: b.center[0])

        text_parts = []
        prev_block = None

        for block in sorted_blocks:
            if prev_block is not None:
                # Horizontal gap between the previous block's right edge
                # and this block's left edge.
                prev_right = max(p[0] for p in prev_block.bbox)
                curr_left = min(p[0] for p in block.bbox)
                gap = curr_left - prev_right

                # max(..., 1) avoids dividing by zero for empty text.
                avg_char_width = (
                    prev_block.width / max(len(prev_block.text), 1)
                )

                if gap > avg_char_width * self._horizontal_gap_threshold:
                    text_parts.append(" ")

            text_parts.append(block.text)
            prev_block = block

        merged_text = "".join(text_parts)
        y_center = (
            sum(b.center[1] for b in sorted_blocks) / len(sorted_blocks)
        )
        x_min = min(min(p[0] for p in b.bbox) for b in sorted_blocks)

        return TextLine(
            text=merged_text,
            blocks=sorted_blocks,
            y_center=y_center,
            x_min=x_min,
        )

    def _lines_to_text(self, lines: List[TextLine]) -> str:
        """Join the line texts with newlines into one string."""
        return "\n".join(line.text for line in lines)

    def _extract_info(
        self, full_text: str, lines: List[TextLine]
    ) -> ExpressInfo:
        """
        Extract waybill information from the merged text.

        Args:
            full_text: The complete merged text.
            lines: The merged text lines.

        Returns:
            The structured waybill information.
        """
        info = ExpressInfo()

        info.courier_company = self._extract_courier_company(full_text)

        # Each FIELD_PATTERNS key is set as an attribute of the same name.
        for field_name, patterns in self.FIELD_PATTERNS.items():
            value = self._extract_field(full_text, patterns)
            if value:
                setattr(info, field_name, value)

        # Fall back to inferring addresses from nearby lines.
        if not info.receiver_address:
            info.receiver_address = self._extract_address_from_context(
                lines, "收"
            )

        if not info.sender_address:
            info.sender_address = self._extract_address_from_context(
                lines, "寄"
            )

        return info

    def _extract_courier_company(self, text: str) -> Optional[str]:
        """
        Return the courier company whose keyword occurs in ``text``.

        Matching is case-insensitive and returns the first hit in
        COURIER_KEYWORDS order, or None when no keyword is present.
        """
        text_upper = text.upper()

        for keyword, company in self.COURIER_KEYWORDS.items():
            if keyword.upper() in text_upper:
                return company

        return None

    def _extract_field(
        self, text: str, patterns: List[str]
    ) -> Optional[str]:
        """
        Extract a field value using a list of regular expressions.

        Patterns are tried in order (multi-line, case-insensitive); the
        first one whose group 1 is non-empty after clean-up wins.

        Args:
            text: Text to search.
            patterns: Regular expressions with the value in group 1.

        Returns:
            The matched value, or None when nothing matches.
        """
        for pattern in patterns:
            match = re.search(pattern, text, re.MULTILINE | re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                # Strip bracket noise: lenticular, square, ASCII round and
                # full-width round brackets (the latter written as escapes;
                # the previous class listed the ASCII pair twice).
                value = re.sub(r'[【】\[\]()\uff08\uff09]', '', value)
                if value:
                    return value

        return None

    def _extract_address_from_context(
        self,
        lines: List[TextLine],
        context_keyword: str
    ) -> Optional[str]:
        """
        Infer an address from the lines around a context keyword.

        Looks at the first line containing ``context_keyword`` and the two
        lines after it (or at every line when the keyword is absent) and
        collects the ones that look like address text.

        Args:
            lines: The merged text lines.
            context_keyword: Keyword marking the relevant section.

        Returns:
            The concatenated address parts, or None when none are found.
        """
        address_keywords = [
            "省", "市", "区", "县", "镇", "村",
            "街", "路", "号", "栋", "楼", "室",
        ]

        # Index of the first line containing the keyword, -1 if absent.
        context_line_idx = next(
            (
                i for i, line in enumerate(lines)
                if context_keyword in line.text
            ),
            -1,
        )

        if context_line_idx >= 0:
            start = context_line_idx
            stop = min(len(lines), context_line_idx + 3)
        else:
            # No anchor line: fall back to scanning the whole document.
            start, stop = 0, len(lines)

        address_parts = []
        for line in lines[start:stop]:
            line_text = line.text

            if not any(kw in line_text for kw in address_keywords):
                continue

            # Drop a leading "label:" prefix (ASCII or full-width colon).
            # NOTE(review): "号" also occurs in labels such as 运单号, so a
            # labelled non-address line can be picked up here.
            cleaned = re.sub(
                r'^[^:\uff1a]*[:\uff1a]\s*', '', line_text
            )
            if cleaned and cleaned != line_text:
                address_parts.append(cleaned)
            elif any(kw in line_text for kw in address_keywords[:4]):
                # Unlabelled lines must name a province/city/district/county.
                address_parts.append(line_text)

        if address_parts:
            return "".join(address_parts)

        return None

    def merge_text_blocks(self, text_blocks: List[TextBlock]) -> str:
        """
        Merge text blocks into text without extracting any fields.

        Args:
            text_blocks: Text blocks to merge.

        Returns:
            The complete merged text, one line per merged row.
        """
        lines = self._merge_blocks_to_lines(text_blocks)
        return self._lines_to_text(lines)