#!/usr/bin/env python3
"""
832平台爬虫 - 最终版
"""

import asyncio
import csv
from datetime import datetime
from playwright.async_api import async_playwright

# Runtime settings: where the CSV goes and how many listing pages to crawl.
CONFIG = {
    "output_file": "henan_products_final_v2.csv",
    "max_pages": 3,
}

# Session cookies injected into the browser context before crawling.
# NOTE(review): the bearer token below is a hardcoded credential with an
# embedded expiry (`exp` claim) — it will stop working and should be moved
# out of source control (env var / config file); confirm with the owner.
COOKIES = [
    {"name": "gxyj_Sign-In-Token", "value": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJjb2RlIjpudWxsLCJ1c2VyX25hbWUiOm51bGwsImNvbXBhbnlOYW1lIjoiIiwiY2xpZW50X2lkIjoiVU5JRklDQVRJT04iLCJhY2NvdW50U3RhdHVzIjoxLCJpc0ZyZWV6ZSI6MiwidWlkIjoiMTM5OTkwNzMxMDg5NzI4NDUyOCIsInB1cmNoYXNlclByb3BlcnR5IjowLCJjb21wYW55Tm8iOiIzMDAwMTA2MDE3IiwiZ3JhbnRfdHlwZSI6Im11bHRpX3Bhc3N3b3JkIiwic2NvcGUiOlsiYWxsIl0sImxvZ2luTmFtZSI6ImNoaW5hZGF2aWQiLCJleHAiOjE3NzMxOTgyNzEsImp0aSI6IjJmM2FiMGZhLTg2MGItNGVlNS05MTM2LThjNDU4MGIzZjRlNSIsInN0YXRpb25JZCI6IjEiLCJhZG1pblR5cGUiOjEsImN1cnJlbnRTdGF0aW9uSWQiOiIxIiwiYWNjb3VudFR5cGUiOiIzIiwiYWNjTm8iOiJBQ0MyNjAzMTAzMDAwMDAwMDAwMDAwMDcyIiwiY29tcGFueVN0YXR1cyI6MSwiYXV0aG9yaXRpZXMiOlsicm9vdCJdLCJhdWQiOlsiMSJdLCJwaG9uZSI6IjE4MTEwMDc4NzYyIiwibWFpbklkIjoiMTM5OTkwNzMxMDg5NzI4NDUzMCIsInVzZXJuYW1lIjoiY2hpbmFkYXZpZCJ9.CctYfcFiUdPUdmXEV7StkAU9PlAWgM-0v4dk6L6beV3SK93HT-b5g9tHIpO45CZTPa2pXw3GMK5X3dg6v5Y9Olxl35DAXYWhQVC2NjI-qkpOIfSSBM8i-DXR_HyIWpyeXWJBTWk4BdWZZPt8wkWzD5SGwM2-whcVQt5phsDrB8c", "domain": ".fupin832.com"},
    {"name": "JSESSIONID", "value": "0BB906763F5D9BEEEEACA841CA0E9690", "domain": "ys.fupin832.com"},
]

# Output column order. Chinese headers: index, product name, unit price
# (yuan), sales, supplier name, contact phone, product URL, scrape time.
CSV_HEADERS = ["序号", "商品名称", "单价(元)", "销量", "供应商名称", "联系电话", "商品链接", "抓取时间"]

class Spider:
    """Playwright-based scraper for Henan (areaCode=410000) product listings.

    Workflow: ``init()`` launches headless Chromium and installs the auth
    cookies from ``COOKIES``; ``run()`` walks up to ``CONFIG['max_pages']``
    listing pages, enriches the first few products per page with detail-page
    data, deduplicates the results, writes them to ``CONFIG['output_file']``,
    and prints a summary.
    """

    def __init__(self):
        self.pw = None        # async_playwright driver handle (stopped in close())
        self.browser = None   # Chromium browser instance
        self.context = None   # browser context carrying the auth cookies
        self.page = None      # main listing page
        self.products = []    # raw product dicts accumulated across pages

    async def init(self):
        """Launch the browser, install cookies, and open the main page."""
        print("🚀 启动...")
        # Keep the driver handle so it can be stopped later; the original
        # bound it to a local and never called .stop(), leaking the driver
        # subprocess.
        self.pw = await async_playwright().start()
        self.browser = await self.pw.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(viewport={"width": 1920, "height": 1080})

        # One batched add_cookies() call instead of one round-trip per cookie.
        await self.context.add_cookies([
            {"name": c["name"], "value": c["value"], "domain": c["domain"], "path": "/"}
            for c in COOKIES
        ])

        self.page = await self.context.new_page()
        print("✅")

    async def get_product_details(self, url):
        """Open *url* in a throwaway page and scrape supplier name and phone.

        Best-effort: returns a ``(supplier, contact)`` tuple of strings, or
        ``('', '')`` on any navigation/extraction failure.
        """
        dp = None
        try:
            dp = await self.context.new_page()
            await dp.goto(url, timeout=12000)
            await dp.wait_for_load_state("networkidle")
            await dp.wait_for_timeout(1500)

            info = await dp.evaluate("""() => {
                const t = document.body.innerText;
                const r = {s:'', c:''};
                
                // 供应商
                const ss = ['供应商','店铺','商家','企业','公司','产地','所属'];
                for(const kw of ss) {
                    const m = t.match(new RegExp(kw+'[：:\\\\s]+([^\\\\n]{2,30})'));
                    if(m) { r.s = m[1].trim(); break; }
                }
                
                // 电话
                const ms = t.match(/(\\d{3,4}[-\\\\s]?\\d{7,8})/);
                if(ms) r.c = ms[1].trim();
                
                return r;
            }""")
            return info.get('s', ''), info.get('c', '')
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and asyncio.CancelledError.
            return '', ''
        finally:
            # The original leaked the page whenever goto()/evaluate() raised.
            if dp is not None:
                try:
                    await dp.close()
                except Exception:
                    pass

    @staticmethod
    def _infer_supplier(name):
        """Guess a supplier from a place-name prefix in *name*.

        Returns the prefix up to and including the first administrative
        suffix (县/市/旗/区), or None if no suffix is present.
        """
        for kw in ['县', '市', '旗', '区']:
            if kw in name:
                return name.split(kw)[0] + kw
        return None

    def _write_csv(self, rows):
        """Write *rows* to CONFIG['output_file'] using CSV_HEADERS."""
        # One timestamp for the whole batch; the original re-read the clock
        # per row, which could straddle a second boundary mid-file.
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # utf-8-sig so Excel detects the encoding of the Chinese headers.
        with open(CONFIG["output_file"], 'w', newline='', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            w.writeheader()
            for i, p in enumerate(rows, 1):
                w.writerow({
                    "序号": i, "商品名称": p.get('n', ''), "单价(元)": p.get('p', ''),
                    "销量": p.get('s', ''), "供应商名称": p.get('sup', ''),
                    "联系电话": p.get('con', ''), "商品链接": p.get('u', ''),
                    "抓取时间": ts
                })

    async def close(self):
        """Close the browser and stop the playwright driver (idempotent)."""
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self.pw:
            await self.pw.stop()
            self.pw = None

    async def run(self):
        """Crawl listing pages, enrich, dedupe, save the CSV, print a summary."""
        for pg in range(1, CONFIG["max_pages"] + 1):
            print(f"\n📄 第 {pg}/{CONFIG['max_pages']} 页")

            base = "https://ys.fupin832.com/product/list?areaCode=410000"
            u = f"{base}&page={pg}" if pg > 1 else base
            await self.page.goto(u, timeout=30000)
            await self.page.wait_for_load_state("networkidle")
            await self.page.wait_for_timeout(2000)

            # Scroll in steps so lazily loaded product cards render.
            for i in range(3):
                await self.page.evaluate(f"window.scrollBy(0,{(i+1)*600})")
                await self.page.wait_for_timeout(800)

            # Extract candidate products from anchors whose text looks like a
            # price card; dedupe and cap at 30 inside the page itself.
            prods = await self.page.evaluate("""() => {
                const rs = [];
                const as = document.querySelectorAll('a[href*="product"], a[href*="detail"]');
                
                as.forEach(a => {
                    const txt = a.innerText?.trim()||'';
                    const href = a.href;
                    
                    // 价格特征
                    if(/￥?\\s*\\d+\\.?\\d*|\\d+\\.?\\d*元/.test(txt) && txt.length>8 && txt.length<120) {
                        const p = txt.match(/￥?\\s*(\\d+\\.?\\d*)/);
                        const s = txt.match(/(?:已售|月销)\\s*(\\d+(?:\\.\\d+)?(?:万|千)?)/);
                        let n = txt.replace(/￥\\s*\\d+\\.?\\d*/,'').replace(/\\d+\\.?\\d*元/,'').replace(/已售\\s*\\d+/,'').replace(/月销\\s*\\d+/,'').trim();
                        n = n.split('\\n')[0].trim();
                        
                        if(n.length>3) rs.push({n: n.substring(0,80), p:p?p[1]:'', s:s?s[1]:'', u:href});
                    }
                });
                
                // 去重
                const u = []; const v = new Set();
                rs.forEach(x => {
                    const k = x.n.substring(0,15)+x.p;
                    if(!v.has(k)){v.add(k);u.push(x);}
                });
                return u.slice(0,30);
            }""")

            print(f"  提取: {len(prods)} 条")

            # Enrich only the first 8 products per page (detail pages are
            # slow); items without a link get a supplier inferred from the name.
            for i, p in enumerate(prods[:8]):
                if p.get('u'):
                    s, c = await self.get_product_details(p['u'])
                    p['sup'] = s
                    p['con'] = c
                    print(f"    [{i+1}] {p['n'][:20]}... → {s[:15] if s else 'N/A'}")
                else:
                    sup = self._infer_supplier(p.get('n', ''))
                    if sup is not None:
                        p['sup'] = sup
                    p['con'] = ''

            self.products.extend(prods)
            print(f"  累计: {len(self.products)}")

        # Cross-page dedupe on (name prefix, price); drop nameless entries.
        cleaned = []
        seen = set()
        for p in self.products:
            k = p.get('n', '')[:20] + str(p.get('p', ''))
            if k not in seen and p.get('n'):
                seen.add(k)
                cleaned.append(p)

        self._write_csv(cleaned)

        sup_cnt = sum(1 for p in cleaned if p.get('sup'))
        con_cnt = sum(1 for p in cleaned if p.get('con'))

        print(f"\n✅ 保存: {CONFIG['output_file']}")
        print(f"📊 总数: {len(cleaned)} | 有供应商: {sup_cnt} | 有电话: {con_cnt}")

        print("\n📋 预览:")
        for i, p in enumerate(cleaned[:8], 1):
            print(f"  {i}. {p.get('n','')[:35]}")
            print(f"     💰{p.get('p','')}元 | 销量 {p.get('s','')}")
            if p.get('sup'): print(f"     🏪 {p.get('sup','')[:30]}")
            if p.get('con'): print(f"     📞 {p.get('con')}")

        # Also stops the playwright driver, unlike the original bare
        # browser.close().
        await self.close()

async def main():
    """Entry point: build the spider, run it, and always release the browser."""
    print("="*50)
    print("  832平台 河南商品爬虫 v5")
    print("="*50)
    spider = Spider()
    try:
        await spider.init()
        await spider.run()
    except Exception as e:
        # Best-effort script: report the failure but don't crash with a
        # traceback (original behavior preserved).
        print(f"❌ {e}")
    finally:
        # run() normally closes the browser itself; this is a safety net for
        # failures between init() and the end of run(). Rewritten from the
        # original `await s.browser.close() if s.browser else None`
        # conditional-expression statement, and guarded so a double close
        # cannot raise out of the finally block.
        if spider.browser:
            try:
                await spider.browser.close()
            except Exception:
                pass


# Guard the entry point so importing this module no longer launches the
# crawl as a side effect (the original called asyncio.run at import time).
if __name__ == "__main__":
    asyncio.run(main())
