#!/usr/bin/env python3
"""
整理832平台数据 - 从商品名提取供应商
"""

import csv
import re

# CSV read by main(); it is opened as UTF-8 with an optional BOM ("utf-8-sig").
INPUT_FILE = "henan_products_with_cookie.csv"
# CSV written by main() with the cleaned, de-duplicated and renumbered rows.
OUTPUT_FILE = "henan_products_cleaned.csv"

def extract_supplier_from_name(name):
    """Derive a supplier (region name) from the beginning of a product name.

    The administrative suffixes 县, 市, 旗 and 区 are tried in that fixed
    order. For the first suffix whose pattern matches, the text from the
    start of ``name`` up to and including the first occurrence of that
    suffix is returned. A suffix sitting at the very first character does
    not match, because at least one other character must precede it.

    Args:
        name: Product name; any falsy value (``None``, ``""``) is accepted.

    Returns:
        The matched region prefix, or ``""`` when ``name`` is falsy or none
        of the suffix patterns match.
    """
    if not name:
        return ""

    # Priority order matters: an earlier suffix wins even when a later one
    # occurs closer to the start of the name.
    region_patterns = (
        r'^([^县]+县)',
        r'^([^市]+市)',
        r'^([^旗]+旗)',
        r'^([^区]+区)',
    )

    for regex in region_patterns:
        found = re.match(regex, name)
        if found is None:
            continue
        region = found.group(1)
        # Minimum-length guard; every pattern already requires at least one
        # character before the suffix, so a match always has length >= 2.
        if len(region) < 2:
            continue
        return region

    return ""

def main():
    """Clean the scraped product CSV and write the result to OUTPUT_FILE.

    Steps:
      1. Read every row of INPUT_FILE.
      2. Drop rows whose product name is shorter than 20 characters and
         contains a navigation/UI keyword (these are page chrome, not goods).
      3. De-duplicate on (first 30 characters of the name, unit price).
      4. Fill the supplier column from the product name when a region
         prefix can be extracted; otherwise keep the existing value.
      5. Write the renumbered rows to OUTPUT_FILE and print a preview plus
         the ten most common suppliers.

    Raises:
        OSError: If INPUT_FILE cannot be read or OUTPUT_FILE cannot be
            written.
    """
    from collections import Counter

    fieldnames = ['序号', '商品名称', '单价(元)', '销量', '供应商名称', '联系电话', '商品链接', '抓取时间']
    # Keywords that identify navigation/UI text scraped alongside products.
    skip_words = ('资讯', '通知', '公告', '更多', '客服', '登录', '注册', '首页', '购物车', '欢迎')

    print("📂 读取数据...")

    # newline='' lets the csv module handle embedded newlines in quoted fields.
    with open(INPUT_FILE, 'r', encoding='utf-8-sig', newline='') as f:
        products = list(csv.DictReader(f))

    print(f"  读取到 {len(products)} 条数据")

    cleaned = []
    seen = set()

    for p in products:
        # DictReader fills missing trailing fields with None, so `or ''`
        # is needed in addition to the .get() default.
        name = (p.get('商品名称') or '').strip()
        price = p.get('单价(元)') or ''

        # Filter out non-product rows.
        if len(name) < 20 and any(w in name for w in skip_words):
            continue

        # De-duplicate; a tuple key cannot collide the way a plain
        # name + price concatenation can.
        key = (name[:30], price)
        if key in seen:
            continue
        seen.add(key)

        # Prefer the region extracted from the name; fall back to whatever
        # supplier value the row already carried.
        supplier = extract_supplier_from_name(name)
        p['供应商名称'] = supplier or p.get('供应商名称') or ''

        cleaned.append(p)

    print(f"\n📊 清理后: {len(cleaned)} 条")

    # Supplier statistics, reused for the distribution printed at the end.
    supplier_counts = Counter(p['供应商名称'] for p in cleaned if p['供应商名称'])
    with_supplier = sum(supplier_counts.values())
    print(f"  有供应商: {with_supplier} 条")

    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8-sig') as f:
        # extrasaction='ignore': input columns outside the output schema
        # (including DictReader's None key for over-long rows) are dropped
        # instead of aborting with ValueError after the file was truncated.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()

        for idx, p in enumerate(cleaned, 1):
            p['序号'] = idx
            writer.writerow(p)

    print(f"\n✅ 已保存: {OUTPUT_FILE}")

    print("\n📋 预览 (前15条):")
    for i, p in enumerate(cleaned[:15], 1):
        print(f"  {i}. {(p.get('商品名称') or '')[:40]}")
        print(f"     💰 {p.get('单价(元)') or ''}元")
        if p.get('供应商名称'):
            print(f"     🏪 {p.get('供应商名称')}")

    print("\n📈 供应商分布 (前10):")
    for sup, cnt in supplier_counts.most_common(10):
        print(f"  - {sup}: {cnt} 条")

# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
