#!/usr/bin/env python3
"""
832平台爬虫 - 使用Cookie获取完整数据
"""

import asyncio
import csv
import json
from datetime import datetime
from playwright.async_api import async_playwright

# Scraper settings: output CSV path and the maximum number of products to keep.
CONFIG = {
    "output_file": "henan_products_with_cookie.csv",
    "max_products": 100,
}

# User-supplied session cookies (pre-authenticated login state for fupin832.com).
# Each entry carries name/value/domain; "path" is added uniformly when installed.
COOKIES = [
    {"name": "gxyj_Sign-In-Token", "value": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJjb2RlIjpudWxsLCJ1c2VyX25hbWUiOm51bGwsImNvbXBhbnlOYW1lIjoiIiwiY2xpZW50X2lkIjoiVU5JRklDQVRJT04iLCJhY2NvdW50U3RhdHVzIjoxLCJpc0ZyZWV6ZSI6MiwidWlkIjoiMTM5OTkwNzMxMDg5NzI4NDUyOCIsInB1cmNoYXNlclByb3BlcnR5IjowLCJjb21wYW55Tm8iOiIzMDAwMTA2MDE3IiwiZ3JhbnRfdHlwZSI6Im11bHRpX3Bhc3N3b3JkIiwic2NvcGUiOlsiYWxsIl0sImxvZ2luTmFtZSI6ImNoaW5hZGF2aWQiLCJleHAiOjE3NzMxOTgyNzEsImp0aSI6IjJmM2FiMGZhLTg2MGItNGVlNS05MTM2LThjNDU4MGIzZjRlNSIsInN0YXRpb25JZCI6IjEiLCJhZG1pblR5cGUiOjEsImN1cnJlbnRTdGF0aW9uSWQiOiIxIiwiYWNjb3VudFR5cGUiOiIzIiwiYWNjTm8iOiJBQ0MyNjAzMTAzMDAwMDAwMDAwMDAwMDcyIiwiY29tcGFueVN0YXR1cyI6MSwiYXV0aG9yaXRpZXMiOlsicm9vdCJdLCJhdWQiOlsiMSJdLCJwaG9uZSI6IjE4MTEwMDc4NzYyIiwibWFpbklkIjoiMTM5OTkwNzMxMDg5NzI4NDUzMCIsInVzZXJuYW1lIjoiY2hpbmFkYXZpZCJ9.CctYfcFiUdPUdmXEV7StkAU9PlAWgM-0v4dk6L6beV3SK93HT-b5g9tHIpO45CZTPa2pXw3GMK5X3dg6v5Y9Olxl35DAXYWhQVC2NjI-qkpOIfSSBM8i-DXR_HyIWpyeXWJBTWk4BdWZZPt8wkWzD5SGwM2-whcVQt5phsDrB8c", "domain": ".fupin832.com"},
    {"name": "JSESSIONID", "value": "0BB906763F5D9BEEEEACA841CA0E9690", "domain": "ys.fupin832.com"},
    {"name": "gxyj_Login_UserName", "value": "chinadavid", "domain": ".fupin832.com"},
    {"name": "gxyj_Login_User_uuid", "value": "1399907310897284528", "domain": ".fupin832.com"},
]

# CSV column order: index, product name, unit price (yuan), sales, supplier, phone, URL, crawl time.
CSV_HEADERS = ["序号", "商品名称", "单价(元)", "销量", "供应商名称", "联系电话", "商品链接", "抓取时间"]

class Spider:
    """Cookie-authenticated Playwright crawler for the 832 platform.

    Workflow: ``init()`` launches Chromium and installs the login cookies,
    ``crawl()`` scrapes the Henan product listing (enriching the first few
    items with supplier details), ``save()`` writes the result to CSV, and
    ``close()`` releases all browser resources.
    """

    def __init__(self):
        # Playwright handles are created lazily in init(); products holds the
        # scraped dicts (name / price / sales / url / supplier / contact).
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.products = []

    async def init(self):
        """Launch the browser, install the auth cookies, and open a page."""
        print("🚀 启动浏览器...")
        # Keep the driver handle so close() can stop it; otherwise the
        # playwright driver process outlives browser.close().
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080}
        )

        # Install every authentication cookie with a single API call.
        print("  添加Cookie...")
        await self.context.add_cookies([
            {
                "name": cookie["name"],
                "value": cookie["value"],
                "domain": cookie["domain"],
                "path": "/",
            }
            for cookie in COOKIES
        ])

        self.page = await self.context.new_page()
        print("✅ 完成")

    async def crawl(self):
        """Scrape the Henan (areaCode=410000) product listing.

        Populates and returns ``self.products`` (capped at
        ``CONFIG["max_products"]``); the first 10 items are additionally
        enriched with supplier name / phone from their detail pages.
        """
        print("\n📄 访问商品列表（使用Cookie）...")
        await self.page.goto("https://ys.fupin832.com/product/list?areaCode=410000", timeout=30000)
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(3000)

        # Scroll a few screens down to trigger lazy-loaded items.
        for i in range(3):
            await self.page.evaluate(f"window.scrollBy(0, {(i+1)*800})")
            await self.page.wait_for_timeout(1000)

        await self.page.screenshot(path="logged_in_page.png", full_page=True)
        print("  📸 截图已保存")

        # Heuristically extract products from the rendered DOM.  All
        # backslashes in the embedded JS are doubled so every regex survives
        # Python string escaping identically (and without SyntaxWarnings).
        print("\n🔍 提取商品数据...")
        products = await self.page.evaluate("""() => {
            const results = [];
            
            // 查找包含价格的元素
            const allDivs = document.querySelectorAll('div');
            
            allDivs.forEach(div => {
                const text = div.innerText || '';
                const html = div.innerHTML || '';
                
                // 匹配商品特征：价格 + 可能的名称
                const hasPrice = /￥\\s*\\d+\\.?\\d*|\\d+\\.?\\d*元/.test(text);
                const hasName = text.length > 5 && text.length < 150;
                
                // 过滤掉非商品
                const skipWords = ['资讯', '通知', '公告', '更多', '客服', '供应商', '入驻', '招募', '帮扶', '登录', '注册', '首页', '购物车'];
                const isSkip = skipWords.some(w => text.startsWith(w) && text.length < 20);
                
                if (hasPrice && hasName && !isSkip) {
                    let name = '', price = '', sales = '';
                    
                    // 提取价格
                    const priceMatch = text.match(/￥\\s*(\\d+\\.?\\d*)|(\\d+\\.?\\d*)\\s*元/);
                    if (priceMatch) price = priceMatch[1] || priceMatch[2];
                    
                    // 提取销量
                    const salesMatch = text.match(/(?:已售|月销)\\s*(\\d+(?:\\.\\d+)?(?:万|千)?)/);
                    if (salesMatch) sales = salesMatch[1];
                    
                    // 提取名称
                    const lines = text.split(/\\n/).filter(l => l.trim());
                    for (const line of lines) {
                        if (!line.match(/^￥/) && !line.match(/已售/) && !line.match(/月销/) && !line.match(/^\\d+\\.?\\d*元?$/) && line.length > 3) {
                            name = line.trim();
                            break;
                        }
                    }
                    
                    // 获取链接
                    let url = '';
                    const link = div.querySelector('a[href*="product"], a[href*="detail"]');
                    if (link) url = link.href;
                    
                    if (name && name.length > 3 && (price || sales)) {
                        results.push({
                            name: name.substring(0, 100),
                            price: price,
                            sales: sales,
                            url: url
                        });
                    }
                }
            });
            
            // 去重
            const unique = [];
            const seen = new Set();
            results.forEach(p => {
                const key = p.name.substring(0, 15) + p.price;
                if (!seen.has(key) && !p.name.includes('欢迎')) {
                    seen.add(key);
                    unique.push(p);
                }
            });
            
            return unique.slice(0, 100);
        }""")
        
        print(f"  初步提取: {len(products)} 条")
        # Honor the configured cap (the JS side already trims to 100 too).
        self.products = products[:CONFIG["max_products"]]

        # Fetch supplier details for the first 10 products only (keeps the
        # run fast; each detail fetch opens and closes a new page).
        print("\n🏪 获取供应商详情...")
        for i, p in enumerate(self.products[:10]):
            if p.get('url'):
                try:
                    detail = await self.get_supplier_detail(p['url'])
                    p['supplier'] = detail.get('supplier', '')
                    p['contact'] = detail.get('contact', '')
                    print(f"    [{i+1}] {p['name'][:25]}... → {p.get('supplier', 'N/A')[:20]}")
                except Exception:
                    # Enrichment is best-effort: keep the product row on failure.
                    p['supplier'] = ''
                    p['contact'] = ''

        return self.products

    async def get_supplier_detail(self, url):
        """Fetch supplier name and contact phone from a product detail page.

        Returns a dict with 'supplier' and 'contact' keys (empty strings on
        any failure). The temporary page is always closed, even on error.
        """
        detail_page = None
        try:
            detail_page = await self.context.new_page()
            await detail_page.goto(url, timeout=10000)
            await detail_page.wait_for_load_state("networkidle")
            await detail_page.wait_for_timeout(1500)

            info = await detail_page.evaluate("""() => {
                const text = document.body.innerText;
                const result = { supplier: '', contact: '' };
                
                // 供应商
                const supMatch = text.match(/供应商[：:]([^\\n]{2,30})/);
                if (supMatch) result.supplier = supMatch[1].trim();
                
                const shopMatch = text.match(/店铺[：:]([^\\n]{2,30})/);
                if (shopMatch) result.supplier = shopMatch[1].trim();
                
                // 电话
                const phoneMatch = text.match(/(?:电话|联系方式|手机)[：:]\\s*(\\d{3,4}[-\\s]?\\d{7,8})/);
                if (phoneMatch) result.contact = phoneMatch[1].trim();
                
                return result;
            }""")

            return info
        except Exception:
            # Degrade gracefully: any navigation/parse failure yields empties.
            return {'supplier': '', 'contact': ''}
        finally:
            # Always close the tab so failed fetches don't leak pages.
            if detail_page is not None:
                await detail_page.close()

    async def save(self):
        """Write the scraped products to the configured CSV file."""
        if not self.products:
            print("⚠️ 无数据")
            return

        # Drop rows without a usable (length > 3) product name.
        cleaned = [p for p in self.products if p.get('name') and len(p.get('name', '')) > 3]

        print(f"\n📊 清理后: {len(cleaned)} 条")

        # One timestamp per run; utf-8-sig BOM so Excel opens the CSV cleanly.
        crawl_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(CONFIG["output_file"], 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            writer.writeheader()

            for idx, p in enumerate(cleaned, 1):
                writer.writerow({
                    "序号": idx,
                    "商品名称": p.get('name', ''),
                    "单价(元)": p.get('price', ''),
                    "销量": p.get('sales', '0'),
                    "供应商名称": p.get('supplier', ''),
                    "联系电话": p.get('contact', ''),
                    "商品链接": p.get('url', ''),
                    "抓取时间": crawl_time
                })

        print(f"✅ 已保存: {CONFIG['output_file']}")

        # Preview the first few rows on stdout.
        print("\n📋 前10条:")
        for i, p in enumerate(cleaned[:10], 1):
            print(f"  {i}. {p.get('name', '')[:30]}")
            print(f"     💰 {p.get('price', '')}元 | 已售 {p.get('sales', '0')}")
            if p.get('supplier'):
                print(f"     🏪 {p.get('supplier', '')[:30]}")

    async def close(self):
        """Shut down the browser and stop the Playwright driver process."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

async def main():
    """Entry point: run the init → crawl → save pipeline with cleanup."""
    banner = "=" * 50
    print(banner)
    print("  832平台 河南商品爬虫 (Cookie版)")
    print(banner)

    spider = Spider()
    try:
        await spider.init()
        await spider.crawl()
        await spider.save()
        print("\n🎉 完成!")
    except Exception as exc:
        # Report the failure but never skip resource cleanup.
        print(f"❌ {exc}")
        import traceback
        traceback.print_exc()
    finally:
        await spider.close()

# Run the async entry point only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
