#!/usr/bin/env python3
"""
832平台爬虫 - 获取供应商详情（增强版）
"""

import asyncio
import csv
import json
from datetime import datetime
from playwright.async_api import async_playwright

# Runtime settings for a crawl run.
CONFIG = dict(
    output_file="henan_products_complete.csv",  # CSV path written by Spider.save()
    max_pages=5,  # number of listing pages to walk through
)

# Session cookies that Spider.init() installs into the browser context
# (each entry is added with path "/").
# NOTE(review): these are hard-coded login credentials (a "Bearer" sign-in
# token, a JSESSIONID and a login user name) committed to source. They are
# presumably tied to one account/session and will stop working when that
# session expires — consider loading them from the environment or an
# untracked config file and rotating the exposed values; confirm with owner.
COOKIES = [
    {"name": "gxyj_Sign-In-Token", "value": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJjb2RlIjpudWxsLCJ1c2VyX25hbWUiOm51bGwsImNvbXBhbnlOYW1lIjoiIiwiY2xpZW50X2lkIjoiVU5JRklDQVRJT04iLCJhY2NvdW50U3RhdHVzIjoxLCJpc0ZyZWV6ZSI6MiwidWlkIjoiMTM5OTkwNzMxMDg5NzI4NDUyOCIsInB1cmNoYXNlclByb3BlcnR5IjowLCJjb21wYW55Tm8iOiIzMDAwMTA2MDE3IiwiZ3JhbnRfdHlwZSI6Im11bHRpX3Bhc3N3b3JkIiwic2NvcGUiOlsiYWxsIl0sImxvZ2luTmFtZSI6ImNoaW5hZGF2aWQiLCJleHAiOjE3NzMxOTgyNzEsImp0aSI6IjJmM2FiMGZhLTg2MGItNGVlNS05MTM2LThjNDU4MGIzZjRlNSIsInN0YXRpb25JZCI6IjEiLCJhZG1pblR5cGUiOjEsImN1cnJlbnRTdGF0aW9uSWQiOiIxIiwiYWNjb3VudFR5cGUiOiIzIiwiYWNjTm8iOiJBQ0MyNjAzMTAzMDAwMDAwMDAwMDAwMDcyIiwiY29tcGFueVN0YXR1cyI6MSwiYXV0aG9yaXRpZXMiOlsicm9vdCJdLCJhdWQiOlsiMSJdLCJwaG9uZSI6IjE4MTEwMDc4NzYyIiwibWFpbklkIjoiMTM5OTkwNzMxMDg5NzI4NDUzMCIsInVzZXJuYW1lIjoiY2hpbmFkYXZpZCJ9.CctYfcFiUdPUdmXEV7StkAU9PlAWgM-0v4dk6L6beV3SK93HT-b5g9tHIpO45CZTPa2pXw3GMK5X3dg6v5Y9Olxl35DAXYWhQVC2NjI-qkpOIfSSBM8i-DXR_HyIWpyeXWJBTWk4BdWZZPt8wkWzD5SGwM2-whcVQt5phsDrB8c", "domain": ".fupin832.com"},
    {"name": "JSESSIONID", "value": "0BB906763F5D9BEEEEACA841CA0E9690", "domain": "ys.fupin832.com"},
    {"name": "gxyj_Login_UserName", "value": "chinadavid", "domain": ".fupin832.com"},
]

# Column names (and order) of the output CSV; Spider.save() writes one dict
# per row keyed by exactly these names.
CSV_HEADERS = [
    "序号",
    "商品名称",
    "单价(元)",
    "销量",
    "供应商名称",
    "联系电话",
    "商品链接",
    "抓取时间",
]

class Spider:
    """Playwright-driven scraper for the Henan product listing of ys.fupin832.com.

    Intended call order: ``init()`` -> ``crawl_pages()`` -> ``save()`` ->
    ``close()``. ``close()`` is safe to call even if ``init()`` never ran or
    failed part-way.

    The JavaScript snippets passed to ``page.evaluate`` are written as raw
    strings so that regex escapes such as ``\\d`` reach the browser verbatim
    instead of relying on Python tolerating invalid escape sequences. The
    comments inside those snippets are part of the string sent to the browser
    and are intentionally left untouched.
    """

    # Administrative suffixes (county / city / banner) used to guess a region
    # from a product name when no detail page is available.
    _REGION_MARKERS = ('县', '市', '旗')

    def __init__(self):
        # Driver handle returned by async_playwright().start(); kept so that
        # close() can stop it.
        self.playwright = None
        self.browser = None
        self.context = None
        # Tab used for the listing pages; detail pages get their own tab.
        self.page = None
        # Product dicts accumulated across all crawled pages.
        self.all_products = []

    async def init(self):
        """Launch headless Chromium, install the session cookies and open the listing tab."""
        print("🚀 启动浏览器...")
        # start() has no context manager to clean up after it, so the handle
        # must be kept and stopped explicitly in close().
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

        # Install all session cookies in a single call.
        await self.context.add_cookies([
            {
                "name": cookie["name"],
                "value": cookie["value"],
                "domain": cookie["domain"],
                "path": "/",
            }
            for cookie in COOKIES
        ])

        self.page = await self.context.new_page()
        print("✅ 完成")

    async def extract_products_with_links(self):
        """Extract products (name, price, sales, url) from the current listing page.

        Runs entirely in the browser: first scans card-like elements whose text
        contains a price; if fewer than 3 results are found it additionally
        scans product-looking links. Results are de-duplicated on
        (first 20 chars of name + price) and capped at 50.

        Returns:
            list[dict]: items with string keys ``name``, ``price``, ``sales``
            and ``url`` (``sales``/``url`` may be empty strings).
        """
        print("  🔍 提取商品及链接...")

        products = await self.page.evaluate(r"""() => {
            const results = [];

            // 方法1: 查找商品卡片
            const cards = document.querySelectorAll('[class*="product"], [class*="goods"], [class*="item"], .el-card');

            cards.forEach(card => {
                const text = card.innerText || '';

                // 必须包含价格
                if (!/￥\s*\d+\.?\d*|\d+\.?\d*元/.test(text)) return;

                // 获取链接
                const link = card.querySelector('a');
                const url = link ? link.href : '';

                // 提取价格
                const priceMatch = text.match(/￥\s*(\d+\.?\d*)|(\d+\.?\d*)\s*元/);
                const price = priceMatch ? (priceMatch[1] || priceMatch[2]) : '';

                // 提取销量
                const salesMatch = text.match(/(?:已售|月销)\s*(\d+(?:\.\d+)?(?:万|千)?)/);
                const sales = salesMatch ? salesMatch[1] : '';

                // 提取名称 - 取最长的非价格、非销量文本
                let name = '';
                const lines = text.split(/\n/).filter(l => l.trim());
                for (const line of lines) {
                    if (line.length > name.length && 
                        !line.match(/^￥/) && 
                        !line.match(/已售|月销/) &&
                        line.length > 5 && line.length < 100) {
                        name = line.trim();
                    }
                }

                // 过滤
                if (!name || name.match(/^(资讯|通知|公告|更多|客服|登录|购物车|首页)/)) return;

                results.push({
                    name: name.substring(0, 100),
                    price: price,
                    sales: sales,
                    url: url
                });
            });

            // 方法2: 如果没找到，尝试查找所有链接
            if (results.length < 3) {
                document.querySelectorAll('a').forEach(a => {
                    const href = a.href || '';
                    const text = a.innerText?.trim() || '';

                    // 商品链接特征
                    if (href.match(/product|detail|goods/) && text.length > 10 && text.length < 100) {
                        const priceMatch = text.match(/￥\s*(\d+\.?\d*)|(\d+\.?\d*)\s*元/);
                        if (priceMatch) {
                            results.push({
                                name: text.replace(/￥\s*\d+\.?\d*/, '').replace(/\d+\.?\d*元/, '').trim().substring(0, 100),
                                price: priceMatch[1] || priceMatch[2],
                                sales: '',
                                url: href
                            });
                        }
                    }
                });
            }

            // 去重
            const unique = [];
            const seen = new Set();
            results.forEach(p => {
                const key = p.name.substring(0, 20) + p.price;
                if (!seen.has(key)) {
                    seen.add(key);
                    unique.push(p);
                }
            });

            return unique.slice(0, 50);
        }""")

        print(f"    找到 {len(products)} 条商品")
        return products

    async def get_supplier_detail(self, url):
        """Open a product detail page and scrape supplier name and phone number.

        Best-effort: any failure is logged and yields empty fields rather than
        raising. The temporary tab is always closed, including on failure.

        Args:
            url: Absolute URL of the product detail page; falsy means "no page".

        Returns:
            dict: ``{'supplier': str, 'contact': str}``; either may be ``''``.
        """
        if not url:
            return {'supplier': '', 'contact': ''}

        detail_page = None
        try:
            print(f"    访问详情页: {url[:50]}...")

            detail_page = await self.context.new_page()
            await detail_page.goto(url, timeout=15000)
            await detail_page.wait_for_load_state("networkidle")
            await detail_page.wait_for_timeout(2000)

            return await detail_page.evaluate(r"""() => {
                const text = document.body.innerText || '';
                const result = { supplier: '', contact: '' };

                // 查找供应商
                // 多种可能的格式
                const patterns = [
                    /供应商[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /店铺[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /商家[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /企业名称[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /公司名称[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /所属地区[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                    /产地[：:\s]+([^\n]{2,50}?)(?:\n|$)/,
                ];

                for (const pattern of patterns) {
                    const match = text.match(pattern);
                    if (match) {
                        result.supplier = match[1].trim();
                        break;
                    }
                }

                // 查找联系电话
                const phonePatterns = [
                    /电话[：:\s]+(\d{3,4}[-\s]?\d{7,8})/,
                    /手机[：:\s]+(\d{11})/,
                    /联系电话[：:\s]+(\d{3,4}[-\s]?\d{7,8})/,
                    /客服[：:\s]+(\d{3,4}[-\s]?\d{7,8})/,
                    /(\d{3,4}[-\s]?\d{7,8})/,  // 直接匹配电话格式
                ];

                for (const pattern of phonePatterns) {
                    const match = text.match(pattern);
                    if (match) {
                        result.contact = match[1].trim();
                        break;
                    }
                }

                // 如果没找到供应商，尝试从商品名推断地区
                if (!result.supplier) {
                    // 商品名通常包含县名
                    const countyMatch = text.match(/^(\S+?县|\S+?市|\S+?旗|\S+?区)/);
                    if (countyMatch) {
                        result.supplier = countyMatch[1];
                    }
                }

                return result;
            }""")

        except Exception as e:
            print(f"    访问详情页失败: {str(e)[:50]}")
            return {'supplier': '', 'contact': ''}
        finally:
            # Always release the tab; otherwise every failed detail page
            # leaves an open tab behind for the rest of the crawl.
            if detail_page is not None:
                try:
                    await detail_page.close()
                except Exception as e:
                    print(f"    关闭详情页失败: {str(e)[:50]}")

    @staticmethod
    def _infer_region_from_name(name):
        """Return the text preceding the first 县/市/旗 in *name*, or ``''``.

        An empty string is returned when the name contains none of the
        markers (so an arbitrary product name is never mistaken for a
        region), or when the prefix is empty or 15+ characters long.
        """
        if not name:
            return ''
        hits = [pos for pos in (name.find(m) for m in Spider._REGION_MARKERS) if pos != -1]
        if not hits:
            return ''
        region = name[:min(hits)]
        return region if 0 < len(region) < 15 else ''

    async def crawl_pages(self):
        """Crawl up to ``CONFIG['max_pages']`` listing pages and enrich each product.

        For every product with a URL the detail page is visited to obtain the
        supplier and contact; otherwise a region is guessed from the product
        name. Results are appended to ``self.all_products``.

        Crawling stops at the first page yielding fewer than 3 products. If
        loading a page fails after some products were already collected, the
        error is logged and crawling stops so that the caller can still save
        the partial result; if nothing was collected yet the error propagates.
        """
        max_pages = CONFIG['max_pages']
        print(f"\n📄 开始爬取 {max_pages} 页数据...\n")

        for page_num in range(1, max_pages + 1):
            print(f"{'='*50}")
            print(f"  第 {page_num}/{max_pages} 页")
            print(f"{'='*50}")

            # Listing URL; the page parameter is only added from page 2 on.
            url = "https://ys.fupin832.com/product/list?areaCode=410000"
            if page_num > 1:
                url += f"&page={page_num}"

            try:
                await self.page.goto(url, timeout=30000)
                await self.page.wait_for_load_state("networkidle")
                await self.page.wait_for_timeout(3000)

                # Scroll in growing steps, pausing between steps.
                for i in range(3):
                    await self.page.evaluate(f"window.scrollBy(0, {(i+1)*600})")
                    await self.page.wait_for_timeout(1000)

                products = await self.extract_products_with_links()
            except Exception as e:
                if not self.all_products:
                    # Nothing to salvage: surface the failure to the caller.
                    raise
                print(f"  ⚠️ 第 {page_num} 页加载失败，停止: {str(e)[:50]}")
                break

            if not products or len(products) < 3:
                print(f"  ⚠️ 第 {page_num} 页无数据，停止")
                break

            print(f"\n  🏪 获取供应商详情...")
            for i, p in enumerate(products):
                if p.get('url'):
                    info = await self.get_supplier_detail(p['url'])
                    p['supplier'] = info.get('supplier', '')
                    p['contact'] = info.get('contact', '')
                    print(f"    [{i+1}] {p['name'][:20]}... → {p.get('supplier', 'N/A')[:15]}")
                else:
                    # No detail page: fall back to a region guessed from the name.
                    region = self._infer_region_from_name(p.get('name', ''))
                    p['supplier'] = (region + '（从商品名推断）') if region else ''
                    p['contact'] = ''

            self.all_products.extend(products)
            print(f"\n  📊 累计: {len(self.all_products)} 条")

    async def save(self):
        """Filter, de-duplicate and write the collected products to the CSV file.

        Rows whose name is 3 characters or shorter, or starts with a known
        non-product prefix, are dropped; duplicates are detected on
        (first 20 chars of name + price). Also prints summary statistics and a
        preview of the first 10 rows. Does nothing but log when no products
        were collected.
        """
        if not self.all_products:
            print("⚠️ 无数据")
            return

        # Clean and de-duplicate in one pass, keeping first occurrences.
        final = []
        seen = set()
        for p in self.all_products:
            name = p.get('name', '').strip()
            if not (name and len(name) > 3) or name.startswith(('欢迎', '资讯', '通知')):
                continue
            key = p['name'][:20] + str(p.get('price', ''))
            if key in seen:
                continue
            seen.add(key)
            final.append(p)

        print(f"\n📊 去重后: {len(final)} 条")

        # One timestamp for the whole file so all rows of a run agree.
        captured_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        with open(CONFIG["output_file"], 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            writer.writeheader()

            for idx, p in enumerate(final, 1):
                writer.writerow({
                    "序号": idx,
                    "商品名称": p.get('name', ''),
                    "单价(元)": p.get('price', ''),
                    # The extractor always sets 'sales' (possibly to ''), so
                    # a plain .get default would never apply.
                    "销量": p.get('sales') or '0',
                    "供应商名称": p.get('supplier', ''),
                    "联系电话": p.get('contact', ''),
                    "商品链接": p.get('url', ''),
                    "抓取时间": captured_at
                })

        print(f"✅ 已保存: {CONFIG['output_file']}")

        # Summary statistics.
        with_supplier = sum(1 for p in final if p.get('supplier'))
        with_contact = sum(1 for p in final if p.get('contact'))
        print(f"\n📈 统计:")
        print(f"  - 总数: {len(final)} 条")
        print(f"  - 有供应商: {with_supplier} 条")
        print(f"  - 有联系方式: {with_contact} 条")

        # Preview of the first rows.
        print(f"\n📋 预览 (前10条):")
        for i, p in enumerate(final[:10], 1):
            print(f"  {i}. {p.get('name', '')[:30]}")
            print(f"     💰 {p.get('price', '')}元 | 已售 {p.get('sales') or '0'}")
            if p.get('supplier'):
                print(f"     🏪 {p.get('supplier')[:30]}")
            if p.get('contact'):
                print(f"     📞 {p.get('contact')}")

    async def close(self):
        """Close the browser and stop the Playwright driver; safe to call repeatedly."""
        try:
            if self.browser:
                await self.browser.close()
                self.browser = None
        finally:
            # Stop the driver even if closing the browser failed.
            if self.playwright:
                await self.playwright.stop()
                self.playwright = None

async def main():
    """Run one complete crawl: start the browser, scrape, write the CSV, clean up.

    Any exception raised by a step is reported with its traceback instead of
    propagating; the spider is closed in every case.
    """
    rule = "=" * 60
    for banner_line in (rule, "  832平台 河南商品爬虫 (完整版 v4)", "  包含供应商信息", rule):
        print(banner_line)

    spider = Spider()
    try:
        # The steps run strictly in order; a failure skips the remaining ones.
        for step in (spider.init, spider.crawl_pages, spider.save):
            await step()
        print("\n🎉 完成!")
    except Exception as exc:
        print(f"❌ {exc}")
        import traceback
        traceback.print_exc()
    finally:
        await spider.close()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
