#!/usr/bin/env python3
"""
832平台爬虫 - 方案A：获取商品详情页链接 + 供应商 + 电话
"""

import asyncio
import csv
from datetime import datetime
from playwright.async_api import async_playwright

# Crawl configuration.
CONFIG = {
    "output_file": "henan_products_detailed.csv",
    "max_pages": 5,  # number of listing pages to crawl
    "detail_timeout": 15000,  # detail-page navigation timeout (ms, Playwright convention)
}

# NOTE(review): hard-coded session credentials (a JWT bearer token and a
# JSESSIONID). These expire and should not live in source control — move
# them to environment variables or an untracked config file.
COOKIES = [
    {"name": "gxyj_Sign-In-Token", "value": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJjb2RlIjpudWxsLCJ1c2VyX25hbWUiOm51bGwsImNvbXBhbnlOYW1lIjoiIiwiY2xpZW50X2lkIjoiVU5JRklDQVRJT04iLCJhY2NvdW50U3RhdHVzIjoxLCJpc0ZyZWV6ZSI6MiwidWlkIjoiMTM5OTkwNzMxMDg5NzI4NDUyOCIsInB1cmNoYXNlclByb3BlcnR5IjowLCJjb21wYW55Tm8iOiIzMDAwMTA2MDE3IiwiZ3JhbnRfdHlwZSI6Im11bHRpX3Bhc3N3b3JkIiwic2NvcGUiOlsiYWxsIl0sImxvZ2luTmFtZSI6ImNoaW5hZGF2aWQiLCJleHAiOjE3NzMxOTgyNzEsImp0aSI6IjJmM2FiMGZhLTg2MGItNGVlNS05MTM2LThjNDU4MGIzZjRlNSIsInN0YXRpb25JZCI6IjEiLCJhZG1pblR5cGUiOjEsImN1cnJlbnRTdGF0aW9uSWQiOiIxIiwiYWNjb3VudFR5cGUiOiIzIiwiYWNjTm8iOiJBQ0MyNjAzMTAzMDAwMDAwMDAwMDAwMDcyIiwiY29tcGFueVN0YXR1cyI6MSwiYXV0aG9yaXRpZXMiOlsicm9vdCJdLCJhdWQiOlsiMSJdLCJwaG9uZSI6IjE4MTEwMDc4NzYyIiwibWFpbklkIjoiMTM5OTkwNzMxMDg5NzI4NDUzMCIsInVzZXJuYW1lIjoiY2hpbmFkYXZpZCJ9.CctYfcFiUdPUdmXEV7StkAU9PlAWgM-0v4dk6L6beV3SK93HT-b5g9tHIpO45CZTPa2pXw3GMK5X3dg6v5Y9Olxl35DAXYWhQVC2NjI-qkpOIfSSBM8i-DXR_HyIWpyeXWJBTWk4BdWZZPt8wkWzD5SGwM2-whcVQt5phsDrB8c", "domain": ".fupin832.com"},
    {"name": "JSESSIONID", "value": "0BB906763F5D9BEEEEACA841CA0E9690", "domain": "ys.fupin832.com"},
]

# Column headers of the output CSV; must match the row dicts written in save().
CSV_HEADERS = ["序号", "商品名称", "单价(元)", "销量", "供应商名称", "联系电话", "商品链接", "抓取时间"]

class Spider:
    """Playwright-based crawler for the fupin832 ("832") marketplace.

    Workflow: open the Henan (areaCode=410000) product-list pages, scrape
    product name / price / sales / detail-page URL from each listing page,
    then visit every detail page to extract supplier name, phone number and
    address.  Deduplicated results are written to a CSV file.
    """

    def __init__(self):
        self.pw = None            # Playwright driver handle, stopped in close()
        self.browser = None       # Chromium browser instance
        self.context = None       # browser context carrying the auth cookies
        self.page = None          # page used for the listing crawl
        self.all_products = []    # accumulated product dicts across all pages

    async def init(self):
        """Start Playwright, launch Chromium and install the auth cookies."""
        print("🚀 初始化浏览器...")
        # Keep the driver handle so close() can stop it; the original code
        # discarded it and leaked the Playwright driver process on shutdown.
        self.pw = await async_playwright().start()
        self.browser = await self.pw.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        # add_cookies accepts a list, so install all cookies in one call.
        await self.context.add_cookies([
            {
                "name": c["name"],
                "value": c["value"],
                "domain": c["domain"],
                "path": "/",
            }
            for c in COOKIES
        ])

        self.page = await self.context.new_page()
        print("✅ 初始化完成")

    async def extract_product_links(self):
        """Extract product links and basic info from the current listing page.

        Returns a list (max 30) of dicts with keys: name, price, sales, url.
        All heuristics run inside the browser via page.evaluate.
        """
        products = await self.page.evaluate("""() => {
            const results = [];

            // Scan every anchor on the page; listings have no stable CSS hooks.
            const allLinks = document.querySelectorAll('a[href]');

            allLinks.forEach(link => {
                const href = link.href;
                const text = link.innerText?.trim() || '';

                // Heuristic: product links either have a product-ish path
                // or show a CNY price in their text.
                const isProductLink = href.match(/product|detail|goods|item/) ||
                                     text.match(/￥\\s*\\d+/);

                if (isProductLink && text.length > 5 && text.length < 150) {
                    // Price: "￥12.5" or "12.5元".
                    const priceMatch = text.match(/￥\\s*(\\d+\\.?\\d*)|(\\d+\\.?\\d*)\\s*元/);
                    const price = priceMatch ? (priceMatch[1] || priceMatch[2]) : '';

                    // Sales: "已售 123" / "月销 1.2万".
                    const salesMatch = text.match(/(?:已售|月销)\\s*(\\d+(?:\\.\\d+)?(?:万|千)?)/);
                    const sales = salesMatch ? salesMatch[1] : '';

                    // Product name = link text minus price / sales fragments.
                    let name = text
                        .replace(/￥\\s*\\d+\\.?\\d*/, '')
                        .replace(/\\d+\\.?\\d*元/, '')
                        .replace(/已售\\s*\\d+/, '')
                        .replace(/月销\\s*\\d+/, '')
                        .trim();

                    // Keep only the first line of multi-line link text.
                    name = name.split(/\\n/)[0].trim();

                    // Drop navigation / boilerplate links.
                    const skipWords = ['资讯', '通知', '公告', '更多', '客服', '登录', '购物车', '首页', '关于我们', '联系我们', '供应商'];
                    if (skipWords.some(w => name.startsWith(w))) return;

                    if (name.length > 3 && (price || sales)) {
                        // Absolutize relative URLs.
                        let url = href;
                        if (!url.startsWith('http')) {
                            url = 'https://ys.fupin832.com' + href;
                        }

                        results.push({
                            name: name.substring(0, 100),
                            price: price,
                            sales: sales,
                            url: url
                        });
                    }
                }
            });

            // Deduplicate by (name prefix, price).
            const unique = [];
            const seen = new Set();
            results.forEach(p => {
                const key = p.name.substring(0, 20) + p.price;
                if (!seen.has(key)) {
                    seen.add(key);
                    unique.push(p);
                }
            });

            return unique.slice(0, 30);
        }""")

        return products

    async def get_product_detail(self, url):
        """Open a product detail page and scrape supplier / phone / address.

        Returns a dict with keys 'supplier', 'contact', 'address'; values are
        empty strings when not found or on any navigation/extraction error.
        """
        detail_page = None
        try:
            # Use a fresh page so the listing page keeps its state.
            detail_page = await self.context.new_page()

            await detail_page.goto(url, timeout=CONFIG['detail_timeout'])
            await detail_page.wait_for_load_state("networkidle")
            await detail_page.wait_for_timeout(2000)

            # NOTE: the original regexes over-escaped whitespace as \\\\s,
            # which the JS engine saw as [\\s] — a literal backslash or the
            # letter "s" — so labels followed by a space (e.g. "电话: …")
            # never matched.  Fixed to a real \s here.
            info = await detail_page.evaluate("""() => {
                const text = document.body.innerText || '';
                const result = {
                    supplier: '',
                    contact: '',
                    address: ''
                };

                // Supplier / shop / company name labels, most specific first.
                const supplierPatterns = [
                    /供应商[：:\\s]+([^\\n]{2,40})/,
                    /店铺[：:\\s]+([^\\n]{2,40})/,
                    /商家[：:\\s]+([^\\n]{2,40})/,
                    /企业名称[：:\\s]+([^\\n]{2,40})/,
                    /公司名称[：:\\s]+([^\\n]{2,40})/,
                    /所属供应商[：:\\s]+([^\\n]{2,40})/,
                    /产地[：:\\s]+([^\\n]{2,40})/,
                    /供货商[：:\\s]+([^\\n]{2,40})/,
                ];

                for (const pattern of supplierPatterns) {
                    const match = text.match(pattern);
                    if (match) {
                        result.supplier = match[1].trim();
                        break;
                    }
                }

                // Phone labels: landline (3-4 digit area code + 7-8 digits)
                // or 11-digit mobile number.
                const phonePatterns = [
                    /电话[：:\\s]+(\\d{3,4}[-\\s]?\\d{7,8})/,
                    /手机[：:\\s]+(\\d{11})/,
                    /联系电话[：:\\s]+(\\d{3,4}[-\\s]?\\d{7,8})/,
                    /客服电话[：:\\s]+(\\d{3,4}[-\\s]?\\d{7,8})/,
                    /联系手机[：:\\s]+(\\d{11})/,
                    /TEL[：:\\s]+(\\d{3,4}[-\\s]?\\d{7,8})/,
                ];

                for (const pattern of phonePatterns) {
                    const match = text.match(pattern);
                    if (match) {
                        result.contact = match[1].trim();
                        break;
                    }
                }

                // Fallback: any bare landline-shaped number in the page text.
                if (!result.contact) {
                    const directPhone = text.match(/(\\d{3,4}[-\\s]\\d{7,8})/);
                    if (directPhone) {
                        result.contact = directPhone[1].trim();
                    }
                }

                // Address labels.
                const addressPatterns = [
                    /地址[：:\\s]+([^\\n]{5,60})/,
                    /产地[：:\\s]+([^\\n]{5,60})/,
                    /发货地[：:\\s]+([^\\n]{5,60})/,
                ];

                for (const pattern of addressPatterns) {
                    const match = text.match(pattern);
                    if (match) {
                        result.address = match[1].trim();
                        break;
                    }
                }

                return result;
            }""")

            return info

        except Exception:
            # Best effort: a failed detail page just yields empty fields.
            return {'supplier': '', 'contact': '', 'address': ''}
        finally:
            # Always close the page, even on error — the original leaked one
            # browser page per failed detail request.
            if detail_page is not None:
                try:
                    await detail_page.close()
                except Exception:
                    pass

    async def crawl_page(self, page_num):
        """Crawl one listing page; return its product dicts (may be empty)."""
        print(f"\n{'='*50}")
        print(f"📄 第 {page_num}/{CONFIG['max_pages']} 页")
        print(f"{'='*50}")

        # Page 1 uses the bare listing URL; later pages add a page parameter.
        if page_num == 1:
            url = "https://ys.fupin832.com/product/list?areaCode=410000"
        else:
            url = f"https://ys.fupin832.com/product/list?areaCode=410000&page={page_num}"

        await self.page.goto(url, timeout=30000)
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(3000)

        # Scroll in steps to trigger lazy-loaded items.
        for i in range(3):
            await self.page.evaluate(f"window.scrollBy(0, {(i+1)*600})")
            await self.page.wait_for_timeout(1000)

        print("  🔍 提取商品列表...")
        products = await self.extract_product_links()
        print(f"  找到 {len(products)} 个商品")

        if not products:
            return []

        # Visit each detail page for supplier / contact / address.
        print("  📞 获取供应商详情...")
        for i, p in enumerate(products):
            if p.get('url'):
                print(f"    [{i+1}/{len(products)}] 访问: {p['name'][:25]}...")
                detail = await self.get_product_detail(p['url'])
                p['supplier'] = detail.get('supplier', '')
                p['contact'] = detail.get('contact', '')
                p['address'] = detail.get('address', '')

                if p['supplier']:
                    print(f"       ✅ 供应商: {p['supplier'][:30]}")
                if p['contact']:
                    print(f"       ✅ 电话: {p['contact']}")
            else:
                # No link: guess the supplier from a leading place name in the
                # product name (…县/…市/…旗/…区 prefix).
                name = p.get('name', '')
                for kw in ['县', '市', '旗', '区']:
                    if kw in name:
                        p['supplier'] = name.split(kw)[0] + kw
                        break

        return products

    async def run(self):
        """Crawl up to CONFIG['max_pages'] listing pages, then save the CSV."""
        for page_num in range(1, CONFIG['max_pages'] + 1):
            products = await self.crawl_page(page_num)

            if not products:
                print(f"  ⚠️ 第 {page_num} 页无数据，停止")
                break

            self.all_products.extend(products)
            print(f"\n  📊 累计: {len(self.all_products)} 条")

            # Small delay between pages to avoid hammering the server.
            await self.page.wait_for_timeout(2000)

        await self.save()

    async def save(self):
        """Deduplicate collected products and write them to the output CSV."""
        if not self.all_products:
            print("⚠️ 无数据")
            return

        # Deduplicate by (name prefix, price), keeping first occurrence.
        cleaned = []
        seen = set()
        for p in self.all_products:
            key = p.get('name', '')[:20] + str(p.get('price', ''))
            if key not in seen and p.get('name'):
                seen.add(key)
                cleaned.append(p)

        # Guard against every record being nameless (would divide by zero).
        if not cleaned:
            print("⚠️ 无数据")
            return

        total = len(cleaned)
        with_supplier = sum(1 for p in cleaned if p.get('supplier'))
        with_contact = sum(1 for p in cleaned if p.get('contact'))

        print(f"\n📊 统计:")
        print(f"  - 总数: {total} 条")
        print(f"  - 有供应商: {with_supplier} 条 ({100*with_supplier//total}%)")
        print(f"  - 有联系电话: {with_contact} 条 ({100*with_contact//total}%)")

        # utf-8-sig so Excel opens the Chinese headers correctly.
        with open(CONFIG["output_file"], 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            writer.writeheader()

            for idx, p in enumerate(cleaned, 1):
                writer.writerow({
                    "序号": idx,
                    "商品名称": p.get('name', ''),
                    "单价(元)": p.get('price', ''),
                    "销量": p.get('sales', ''),
                    "供应商名称": p.get('supplier', ''),
                    "联系电话": p.get('contact', ''),
                    "商品链接": p.get('url', ''),
                    "抓取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

        print(f"\n✅ 已保存: {CONFIG['output_file']}")

        # Console preview of the first few rows.
        print(f"\n📋 预览 (前10条):")
        for i, p in enumerate(cleaned[:10], 1):
            print(f"  {i}. {p.get('name', '')[:35]}")
            print(f"     💰 {p.get('price', '')}元 | 销量 {p.get('sales', '')}")
            if p.get('supplier'):
                print(f"     🏪 {p.get('supplier', '')[:40]}")
            if p.get('contact'):
                print(f"     📞 {p.get('contact')}")

    async def close(self):
        """Release the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
        if self.pw:
            await self.pw.stop()

async def main():
    """Entry point: build the spider, run it, and always release resources."""
    banner = "=" * 60
    print(banner)
    print("  832平台爬虫 - 方案A完整版")
    print("  目标: 供应商名称 + 联系电话")
    print(banner)

    spider = Spider()
    try:
        await spider.init()
        await spider.run()
        print("\n🎉 爬虫执行完成!")
    except Exception as exc:
        print(f"❌ 错误: {exc}")
        import traceback
        traceback.print_exc()
    finally:
        await spider.close()


if __name__ == "__main__":
    asyncio.run(main())
