#!/usr/bin/env python3
"""
832平台爬虫 - 直接从页面文本提取 + 构造搜索URL获取详情
"""

import asyncio
import csv
import re
from datetime import datetime
from urllib.parse import quote

from playwright.async_api import async_playwright

# Runtime configuration: where to write the CSV and how many listing pages to crawl.
CONFIG = {
    "output_file": "henan_products_full.csv",
    "max_pages": 3,
}

# Pre-baked sign-in cookie (JWT bearer token) injected into the browser context
# before any navigation, so listing/detail pages render as a logged-in user.
# NOTE(review): token is hard-coded and will expire — confirm how it is refreshed.
COOKIES = [
    {"name": "gxyj_Sign-In-Token", "value": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJjb2RlIjpudWxsLCJ1c2VyX25hbWUiOm51bGwsImNvbXBhbnlOYW1lIjoiIiwiY2xpZW50X2lkIjoiVU5JRklDQVRJT04iLCJhY2NvdW50U3RhdHVzIjoxLCJpc0ZyZWV6ZSI6MiwidWlkIjoiMTM5OTkwNzMxMDg5NzI4NDUyOCIsInB1cmNoYXNlclByb3BlcnR5IjowLCJjb21wYW55Tm8iOiIzMDAwMTA2MDE3IiwiZ3JhbnRfdHlwZSI6Im11bHRpX3Bhc3N3b3JkIiwic2NvcGUiOlsiYWxsIl0sImxvZ2luTmFtZSI6ImNoaW5hZGF2aWQiLCJleHAiOjE3NzMxOTgyNzEsImp0aSI6IjJmM2FiMGZhLTg2MGItNGVlNS05MTM2LThjNDU4MGIzZjRlNSIsInN0YXRpb25JZCI6IjEiLCJhZG1pblR5cGUiOjEsImN1cnJlbnRTdGF0aW9uSWQiOiIxIiwiYWNjb3VudFR5cGUiOiIzIiwiYWNjTm8iOiJBQ0MyNjAzMTAzMDAwMDAwMDAwMDAwMDcyIiwiY29tcGFueVN0YXR1cyI6MSwiYXV0aG9yaXRpZXMiOlsicm9vdCJdLCJhdWQiOlsiMSJdLCJwaG9uZSI6IjE4MTEwMDc4NzYyIiwibWFpbklkIjoiMTM5OTkwNzMxMDg5NzI4NDUzMCIsInVzZXJuYW1lIjoiY2hpbmFkYXZpZCJ9.CctYfcFiUdPUdmXEV7StkAU9PlAWgM-0v4dk6L6beV3SK93HT-b5g9tHIpO45CZTPa2pXw3GMK5X3dg6v5Y9Olxl35DAXYWhQVC2NjI-qkpOIfSSBM8i-DXR_HyIWpyeXWJBTWk4BdWZZPt8wkWzD5SGwM2-whcVQt5phsDrB8c", "domain": ".fupin832.com"},
]

# CSV column headers: index, product name, unit price (yuan), sales, supplier
# name, contact phone, product URL, scrape timestamp.
CSV_HEADERS = ["序号", "商品名称", "单价(元)", "销量", "供应商名称", "联系电话", "商品链接", "抓取时间"]

class Spider:
    """Crawler for the 832 poverty-alleviation platform.

    Extracts product name/price directly from the listing page text, then
    fetches supplier name and phone for a subset of products by constructing
    in-site search URLs and visiting the first matching detail page.
    Results are written to the CSV configured in ``CONFIG``.
    """

    def __init__(self):
        self.pw = None             # Playwright driver handle; stopped in close()
        self.browser = None
        self.context = None
        self.page = None
        self.all_products = []     # product dicts accumulated across pages

    async def init(self):
        """Launch headless Chromium, install the auth cookies and open a page."""
        print("🚀 启动...")
        # Keep the driver handle so close() can stop it (was leaked before).
        self.pw = await async_playwright().start()
        self.browser = await self.pw.chromium.launch(headless=True, args=['--no-sandbox'])
        self.context = await self.browser.new_context(viewport={"width": 1920, "height": 1080})

        for c in COOKIES:
            await self.context.add_cookies([{"name": c["name"], "value": c["value"], "domain": c["domain"], "path": "/"}])

        self.page = await self.context.new_page()
        print("✅")

    async def extract_from_page(self):
        """Scan the current page's DOM text for product name/price pairs.

        Returns a list of dicts ``{"name", "price", "url"}`` (at most 50),
        de-duplicated on (name prefix, price) and filtered against common
        navigation/boilerplate words.
        """
        products = await self.page.evaluate("""() => {
            const results = [];
            const seen = new Set();

            // Inspect every div whose text contains a price marker.
            document.querySelectorAll('div').forEach(div => {
                const text = div.innerText?.trim() || '';

                // Must contain the full-width yuan sign.
                if (!text.includes('￥')) return;

                // Extract the numeric price after the sign.
                const priceMatch = text.match(/￥\\s*(\\d+\\.?\\d*)/);
                if (!priceMatch) return;
                const price = priceMatch[1];

                // Product name: first non-price, non-numeric line of useful length.
                const lines = text.split(/\\n/).map(l => l.trim()).filter(l => l);
                let name = '';
                for (const line of lines) {
                    if (!line.match(/^￥/) && !line.match(/^\\d+\\.?\\d*$/) && line.length > 3) {
                        name = line;
                        break;
                    }
                }

                // Drop too-short names and obvious site-chrome text.
                if (!name || name.length < 5) return;
                const skipWords = ['资讯', '通知', '公告', '更多', '客服', '登录', '购物车', '首页', '关于', '联系'];
                if (skipWords.some(w => name.startsWith(w))) return;

                // De-duplicate on name prefix + price.
                const key = name.substring(0, 20) + price;
                if (seen.has(key)) return;
                seen.add(key);

                results.push({
                    name: name.substring(0, 100),
                    price: price,
                    url: ''
                });
            });

            return results.slice(0, 50);
        }""")

        return products

    async def search_product(self, product_name):
        """Search on-site for *product_name* and return the first detail URL.

        Returns '' when no product/detail link is found or any step fails.
        """
        try:
            # URL-encode the truncated keyword (product names are Chinese).
            search_url = f"https://ys.fupin832.com/product/list?keyword={quote(product_name[:20])}"

            search_page = await self.context.new_page()
            await search_page.goto(search_url, timeout=10000)
            await search_page.wait_for_timeout(2000)

            # First anchor that looks like a product detail page.
            first_link = await search_page.evaluate("""() => {
                const links = document.querySelectorAll('a[href*="product"], a[href*="detail"]');
                for (const link of links) {
                    const href = link.href;
                    if (href.includes('/product/') || href.includes('/detail/')) {
                        return href;
                    }
                }
                return '';
            }""")

            await search_page.close()
            return first_link

        except Exception:
            return ''

    async def get_detail_by_search(self, product_name):
        """Look up supplier info for *product_name* via the in-site search.

        Opens the search results, follows the first detail-looking link and
        scrapes the page text. Returns ``{'s': supplier, 'c': contact}``;
        both values are '' when nothing is found or an error occurs.
        """
        try:
            # Truncate long names and URL-encode the keyword.
            keyword = product_name[:15]
            search_url = f"https://ys.fupin832.com/product/list?keyword={quote(keyword)}"

            detail_page = await self.context.new_page()
            await detail_page.goto(search_url, timeout=12000)
            await detail_page.wait_for_timeout(2000)

            # Find the first link that points at a product/detail/goods page.
            result = await detail_page.evaluate("""() => {
                const links = Array.from(document.querySelectorAll('a[href]'));
                for (const link of links) {
                    const href = link.href;
                    if (href.includes('/product/') || href.includes('/detail/') || href.includes('/goods/')) {
                        return { found: true, url: href };
                    }
                }
                return { found: false };
            }""")

            if result.get('found') and result.get('url'):
                # Visit the detail page and scrape supplier + phone.
                await detail_page.goto(result['url'], timeout=10000)
                await detail_page.wait_for_timeout(2000)

                info = await detail_page.evaluate("""() => {
                    const text = document.body.innerText;
                    const r = { s: '', c: '' };

                    // Supplier: text following a “供应商:” label.
                    const m1 = text.match(/供应商[：:\\s]+([^\\n]{2,40})/);
                    if (m1) r.s = m1[1].trim();

                    // Phone: landline-style 3-4 digit prefix + 7-8 digit number.
                    const m2 = text.match(/(\\d{3,4}[-\\s]?\\d{7,8})/);
                    if (m2) r.c = m2[1].trim();

                    return r;
                }""")

                await detail_page.close()
                return info

            await detail_page.close()
            return {'s': '', 'c': ''}

        except Exception:
            # Best-effort lookup: any failure degrades to empty details.
            return {'s': '', 'c': ''}

    async def crawl_page(self, page_num):
        """Crawl one listing page; returns the extracted product dicts."""
        print(f"\n📄 第 {page_num} 页")

        url = "https://ys.fupin832.com/product/list?areaCode=410000"
        if page_num > 1:
            url += f"&page={page_num}"

        await self.page.goto(url, timeout=30000)
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(3000)

        # Scroll to trigger lazy-loaded content.
        for i in range(3):
            await self.page.evaluate(f"window.scrollBy(0, {(i+1)*600})")
            await self.page.wait_for_timeout(1000)

        print("  提取商品...")
        products = await self.extract_from_page()
        print(f"  找到 {len(products)} 条")

        if not products:
            return []

        # Fetch real supplier details for the first five products only
        # (each lookup costs several seconds of page navigation).
        print("  获取详情...")
        for i, p in enumerate(products[:5]):
            print(f"    [{i+1}] {p['name'][:20]}...")
            detail = await self.get_detail_by_search(p['name'])
            p['supplier'] = detail.get('s', '')
            p['contact'] = detail.get('c', '')
            if p['supplier']:
                print(f"       ✅ {p['supplier'][:20]}")

        # For the rest, guess the supplier from a region suffix in the name.
        for p in products[5:]:
            name = p.get('name', '')
            for kw in ['县', '市', '旗']:
                if kw in name:
                    p['supplier'] = name.split(kw)[0] + kw
                    break

        return products

    async def run(self):
        """Crawl up to CONFIG['max_pages'] pages, then save to CSV."""
        for page in range(1, CONFIG['max_pages'] + 1):
            products = await self.crawl_page(page)
            if not products:
                # Empty page means we've run out of listings.
                break
            self.all_products.extend(products)
            print(f"  累计: {len(self.all_products)}")

        await self.save()

    async def save(self):
        """De-duplicate accumulated products and write them to the CSV."""
        if not self.all_products:
            print("⚠️ 无数据")
            return

        # De-duplicate on (name prefix, price), preserving first occurrence.
        cleaned = []
        seen = set()
        for p in self.all_products:
            key = p.get('name', '')[:20] + str(p.get('price', ''))
            if key not in seen:
                seen.add(key)
                cleaned.append(p)

        # utf-8-sig BOM so Excel opens the Chinese headers correctly.
        with open(CONFIG["output_file"], 'w', newline='', encoding='utf-8-sig') as f:
            w = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            w.writeheader()
            for i, p in enumerate(cleaned, 1):
                w.writerow({
                    "序号": i,
                    "商品名称": p.get('name', ''),
                    "单价(元)": p.get('price', ''),
                    "销量": p.get('sales', ''),
                    "供应商名称": p.get('supplier', ''),
                    "联系电话": p.get('contact', ''),
                    "商品链接": p.get('url', ''),
                    "抓取时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

        sup_cnt = len([p for p in cleaned if p.get('supplier')])
        con_cnt = len([p for p in cleaned if p.get('contact')])

        print(f"\n✅ 保存: {CONFIG['output_file']}")
        print(f"  总数: {len(cleaned)}")
        print(f"  有供应商: {sup_cnt}")
        print(f"  有电话: {con_cnt}")

        print(f"\n📋 预览:")
        for i, p in enumerate(cleaned[:8], 1):
            print(f"  {i}. {p.get('name', '')[:35]}")
            print(f"     💰{p.get('price', '')}元")
            if p.get('supplier'): print(f"     🏪 {p.get('supplier')[:30]}")
            if p.get('contact'): print(f"     📞 {p.get('contact')}")

    async def close(self):
        """Release the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
        if self.pw:
            await self.pw.stop()

async def main():
    """Entry point: build the spider, run it, and always release the browser."""
    banner = "=" * 50
    print(banner)
    print("  832平台爬虫 - 搜索获取详情版")
    print(banner)

    spider = Spider()
    try:
        await spider.init()
        await spider.run()
        print("\n🎉 完成!")
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"❌ {e}")
    finally:
        await spider.close()


if __name__ == "__main__":
    asyncio.run(main())
