#!/usr/bin/env python3
"""
832平台爬虫 - 河南商品（API精准版）
直接调用后端API获取数据
"""

import asyncio
import csv
import json
import re
from datetime import datetime
from playwright.async_api import async_playwright, Page
from urllib.parse import urlencode

# Scrape configuration.
CONFIG = {
    "area_code": "410000",  # administrative area code for Henan province
    "output_file": "henan_products_final.csv",
    "max_pages": 50,  # max pages to crawl — NOTE(review): not referenced anywhere in this file
    "page_size": 20   # items per page — NOTE(review): not referenced anywhere in this file
}

# CSV column headers (Chinese): index, product name, unit price (yuan),
# sales count, supplier name, product spec, product URL, scrape timestamp.
# These are runtime data written into the output file; do not translate.
CSV_HEADERS = [
    "序号",
    "商品名称",
    "单价(元)",
    "销量",
    "供应商名称",
    "商品规格",
    "商品链接",
    "抓取时间"
]

class Spider832:
    """Scraper for the 832 platform (Henan products).

    Pipeline: launch a headless Chromium via Playwright, visit the product
    list page while capturing backend API responses, extract product cards
    from the rendered DOM, optionally enrich items with supplier/spec data
    from their detail pages, then dump everything to a CSV file.
    """

    def __init__(self):
        # Playwright driver handle; kept so close() can stop it cleanly.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        # Accumulated product dicts: name/price/sales/supplier/spec/url.
        self.products = []

    async def init_browser(self):
        """Start the Playwright driver, launch headless Chromium, open a page."""
        print("🚀 启动浏览器...")
        # Store the driver on self so close() can call .stop() later.
        # (Previously it was kept only in a local variable and leaked.)
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        self.page = await self.context.new_page()
        print("✅ 浏览器启动成功")

    async def fetch_api_with_intercept(self):
        """Visit the product list page and capture matching API responses.

        Returns a dict mapping truncated URLs (first 80 chars) to truncated
        response bodies (first 10000 chars) for every 200-status response
        whose URL mentions 'searchEs' or 'product'.
        """
        print("\n📡 拦截API请求...")

        # Captured API payloads, keyed by truncated URL.
        api_data = {}

        async def handle_response(response):
            # Best-effort capture: any failure to read a body is ignored.
            url = response.url
            if 'searchEs' in url or 'product' in url.lower():
                try:
                    if response.status == 200:
                        body = await response.text()
                        if '{' in body:
                            api_data[url[:80]] = body[:10000]
                except Exception:
                    # Narrowed from a bare `except:` so task cancellation
                    # (asyncio.CancelledError) is not swallowed.
                    pass

        self.page.on('response', handle_response)

        # Navigate to the product list page for the configured area.
        print("  访问商品列表页面...")
        await self.page.goto(
            f"https://ys.fupin832.com/product/list?areaCode={CONFIG['area_code']}",
            timeout=30000
        )
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(3000)

        # Scroll in steps to trigger lazy-loading API calls.
        for i in range(5):
            await self.page.evaluate(f"window.scrollBy(0, {(i+1)*600})")
            await self.page.wait_for_timeout(800)

        # Full-page screenshot for debugging the rendered state.
        await self.page.screenshot(path="debug_page.png", full_page=True)

        print(f"  捕获到 {len(api_data)} 个相关API响应")

        return api_data

    async def extract_from_page_elements(self):
        """Extract product entries from the rendered DOM.

        Runs an in-page heuristic (price/sales text patterns over all divs),
        deduplicates, and returns at most 100 dicts with keys
        name/price/sales/url.
        """
        print("\n🎯 精确提取商品数据...")

        products = await self.page.evaluate("""() => {
            const results = [];
            
            // 查找所有商品卡片/条目
            // 这个网站用的是Vue/Nuxt，商品可能在各种容器中
            
            // 方法1: 查找包含价格和销量信息的卡片
            const cards = document.querySelectorAll('.el-card, .product-card, .goods-card, [class*="card"]');
            
            // 方法2: 查找商品列表区域
            const productArea = document.querySelector('.product-list, .goods-list, .list-content, main, .main-content');
            
            // 方法3: 基于价格模式的商品
            // 价格通常格式: ￥xxx.xx 或 xxx元
            const allDivs = document.querySelectorAll('div');
            
            allDivs.forEach(div => {
                const html = div.innerHTML || '';
                const text = div.innerText || '';
                
                // 匹配商品特征: 价格 + 可能有销量 + 可能有名称
                // 匹配: ￥123.00 或 123元 或 已售123
                const hasPrice = /￥\\s*\\d+\\.?\\d*|\\d+\\.?\\d*元/.test(text);
                const hasSales = /已售\\s*\\d+|月销\\s*\\d+/.test(text);
                const hasName = text.length > 5 && text.length < 200 && !text.match(/^(资讯|通知|公告|更多|客服|供应商)/);
                
                if (hasPrice || (hasSales && hasName)) {
                    // 尝试提取各个字段
                    let name = '';
                    let price = '';
                    let sales = '';
                    
                    // 提取价格
                    const priceMatch = text.match(/￥\\s*(\\d+\\.?\\d*)|(\\d+\\.?\\d*)\\s*元/);
                    if (priceMatch) {
                        price = priceMatch[1] || priceMatch[2];
                    }
                    
                    // 提取销量
                    const salesMatch = text.match(/(?:已售|月销)\\s*(\\d+(?:\\.\\d+)?(?:万|千)?)/);
                    if (salesMatch) {
                        sales = salesMatch[1];
                    }
                    
                    // 提取名称 - 取第一行或不含价格的部分
                    const lines = text.split(/[\\n\\r]+/).filter(l => l.trim());
                    for (const line of lines) {
                        if (!line.match(/^￥/) && !line.match(/已售/) && !line.match(/月销/) && !line.match(/^\\d+\\.?\\d*元?$/)) {
                            if (line.length > 3 && line.length < 80) {
                                name = line.trim();
                                break;
                            }
                        }
                    }
                    
                    // 获取商品链接
                    let url = '';
                    const link = div.querySelector('a[href*="product"], a[href*="detail"], a[href*="goods"]');
                    if (link) {
                        url = link.href;
                    }
                    
                    if (name && (price || sales)) {
                        results.push({
                            name: name.substring(0, 100),
                            price: price,
                            sales: sales,
                            url: url
                        });
                    }
                }
            });
            
            // 去重
            const unique = [];
            const seen = new Set();
            results.forEach(p => {
                const key = p.name.substring(0, 20) + p.price;
                if (!seen.has(key)) {
                    seen.add(key);
                    unique.push(p);
                }
            });
            
            return unique.slice(0, 100); // 限制数量
        }""")

        return products

    async def get_supplier_from_detail(self, product_url):
        """Open *product_url* in a fresh tab and scrape supplier/spec text.

        Returns {"supplier": str, "spec": str}; empty strings on any failure.
        Best-effort: never raises.
        """
        if not product_url:
            return {"supplier": "", "spec": ""}

        detail_page = None
        try:
            # Use a separate tab so the list page keeps its state.
            detail_page = await self.context.new_page()
            await detail_page.goto(product_url, timeout=10000)
            await detail_page.wait_for_load_state("networkidle")
            await detail_page.wait_for_timeout(1500)

            info = await detail_page.evaluate("""() => {
                const result = { supplier: '', spec: '' };
                const text = document.body.innerText;
                
                // 查找供应商关键词
                const supplierMatch = text.match(/供应商[：:]([^\\n]+)/);
                if (supplierMatch) result.supplier = supplierMatch[1].trim();
                
                // 查找店铺
                const shopMatch = text.match(/店铺[：:]([^\\n]+)/);
                if (shopMatch) result.supplier = shopMatch[1].trim();
                
                // 查找规格
                const specMatch = text.match(/规格[：:]([^\\n]+)/);
                if (specMatch) result.spec = specMatch[1].trim();
                
                return result;
            }""")

            return info

        except Exception:
            return {"supplier": "", "spec": ""}

        finally:
            # Always close the tab — previously it leaked whenever
            # goto/evaluate raised before the success-path close().
            if detail_page is not None:
                await detail_page.close()

    async def save_to_csv(self):
        """Clean, deduplicate and write self.products to the configured CSV."""
        if not self.products:
            print("⚠️ 没有数据")
            return

        # Clean pass: drop obvious non-product rows.
        cleaned = []
        for p in self.products:
            name = p.get('name', '').strip()
            price = p.get('price', '').strip()
            sales = p.get('sales', '0').strip()

            # Skip navigation/announcement text unless it carries a price.
            skip_words = ['资讯', '通知', '公告', '更多', '客服', '供应商', '入驻', '招募', '帮扶', '关于我们', '联系我们']
            if any(w in name for w in skip_words) and not price:
                continue

            if name and len(name) > 2:
                cleaned.append({
                    "name": name[:100],
                    "price": price,
                    "sales": sales,
                    "supplier": p.get('supplier', ''),
                    "spec": p.get('spec', ''),
                    "url": p.get('url', '')
                })

        # Dedupe on (name prefix, price) — same key as the in-page JS dedupe.
        final = []
        seen = set()
        for p in cleaned:
            key = p['name'][:15] + str(p['price'])
            if key not in seen:
                seen.add(key)
                final.append(p)

        print(f"\n📊 清理后共 {len(final)} 条有效商品")

        # Write the CSV; utf-8-sig BOM keeps Excel happy with Chinese text.
        output_path = CONFIG["output_file"]
        with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            writer.writeheader()

            for idx, p in enumerate(final, 1):
                row = {
                    "序号": idx,
                    "商品名称": p['name'],
                    "单价(元)": p['price'],
                    "销量": p['sales'],
                    "供应商名称": p.get('supplier', ''),
                    "商品规格": p.get('spec', ''),
                    "商品链接": p.get('url', ''),
                    "抓取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                }
                writer.writerow(row)

        print(f"✅ 已保存到: {output_path}")

        # Preview the first 10 rows on stdout.
        print("\n📋 商品预览 (前10条):")
        for i, p in enumerate(final[:10], 1):
            print(f"  {i}. {p['name'][:35]}")
            print(f"     💰 {p['price']}元 | 已售 {p['sales']}")

    async def close(self):
        """Shut down the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            # Previously never stopped — the driver process was leaked.
            await self.playwright.stop()

async def main():
    """Entry point: run the full scrape pipeline end to end."""
    banner = "=" * 60
    print(banner)
    print("  832平台 河南商品爬虫 v3.0 (精准版)")
    print(banner + "\n")

    spider = Spider832()

    try:
        await spider.init_browser()

        # Warm up the list page and capture backend API traffic.
        await spider.fetch_api_with_intercept()

        # Pull product cards out of the rendered DOM.
        products = await spider.extract_from_page_elements()
        print(f"\n  初步提取: {len(products)} 条")

        spider.products = products

        # Enrich only the first ten items with supplier details,
        # to keep the overall run time reasonable.
        print("\n  获取供应商信息...")
        for i, p in enumerate(spider.products[:10]):
            if not p.get('url'):
                continue
            info = await spider.get_supplier_from_detail(p['url'])
            p['supplier'] = info.get('supplier', '')
            p['spec'] = info.get('spec', '')
            print(f"    [{i+1}] {p['name'][:20]}... → {info.get('supplier', 'N/A')[:20]}")

        # Persist whatever was collected.
        await spider.save_to_csv()

        print("\n" + banner)
        print("  🎉 完成!")
        print(banner)

    except Exception as e:
        print(f"❌ 错误: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Browser teardown runs on both success and failure paths.
        await spider.close()

if __name__ == "__main__":
    # Run the async pipeline when executed as a script.
    asyncio.run(main())
