#!/usr/bin/env python3
"""
832平台爬虫 - 河南商品（最终版）
无需登录，直接调用API获取数据
"""

import asyncio
import csv
import json
import re
from datetime import datetime
from playwright.async_api import async_playwright

# Crawler configuration: target region, output path, and paging limits.
CONFIG = dict(
    area_code="410000",                   # Henan province administrative code
    output_file="henan_products.csv",
    base_url="https://ys.fupin832.com",
    max_pages=10,                         # hard cap on listing pages to crawl
    page_size=20,                         # items requested per page
)

# CSV column order; header text is intentionally Chinese (user-facing file).
CSV_HEADERS = [
    "序号", "供应商名称", "商品名称", "单价(元)",
    "销量", "商品规格", "商品链接", "抓取时间",
]

class Spider832:
    """Playwright-based crawler for Henan products on the 832 platform.

    Lifecycle: init_browser() -> crawl_all_pages() -> save_to_csv() -> close().
    Collected products are dicts with name/price/sales/url (and optionally
    supplier/contact) keys, accumulated in self.all_products.
    """

    def __init__(self):
        # Playwright handles; all populated by init_browser().
        self.playwright = None   # driver handle, kept so close() can stop it
        self.browser = None
        self.context = None
        self.page = None
        self.products = []       # kept for backward compatibility (unused internally)
        self.all_products = []   # products accumulated across all pages

    async def init_browser(self):
        """Start headless Chromium and open one page with a desktop UA."""
        print("🚀 启动浏览器...")
        # BUGFIX: the driver handle used to be a local variable, so it could
        # never be stopped in close() and its subprocess leaked on every run.
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        self.page = await self.context.new_page()
        print("✅ 浏览器启动成功")

    @staticmethod
    def _products_from_api_body(body):
        """Best-effort extraction of product records from an unknown API payload.

        Recursively searches *body* for the first list of dicts that carries a
        name-like key, then normalizes each record to the crawler's
        name/price/sales/url shape. Returns [] when nothing recognizable is
        found. Pure function: safe to call with any JSON-decoded value.
        """
        # Field-name guesses; the platform's real schema is undocumented.
        name_keys = ("productName", "goodsName", "name", "title")
        price_keys = ("price", "salePrice", "unitPrice")

        def first_record_list(node):
            # Depth-first walk: accept the first list whose leading item
            # looks like a product dict.
            if isinstance(node, list):
                if node and isinstance(node[0], dict) and any(k in node[0] for k in name_keys):
                    return node
                return None
            if isinstance(node, dict):
                for value in node.values():
                    hit = first_record_list(value)
                    if hit is not None:
                        return hit
            return None

        products = []
        for rec in first_record_list(body) or []:
            if not isinstance(rec, dict):
                continue
            name = next((str(rec[k]) for k in name_keys if rec.get(k)), '')
            if not name:
                continue  # a record without a name is useless downstream
            price = next((str(rec[k]) for k in price_keys if rec.get(k) is not None), '')
            sales = str(rec.get("sales", rec.get("saleCount", 0)))
            products.append({
                "name": name,
                "price": price,
                "sales": sales,
                "url": rec.get("url", '')
            })
        return products

    async def fetch_products_from_api(self, page_num=1):
        """Fetch one page of products: try the JSON APIs, else scrape the page.

        Returns a list of product dicts (name/price/sales/url).
        """
        print(f"\n📥 正在获取第 {page_num} 页商品...")

        # Candidate API endpoints (the platform's real one is undocumented).
        api_urls = [
            f"{CONFIG['base_url']}/frontweb/decorationcenter/searchEsRecommendData",
            f"{CONFIG['base_url']}/frontapi/product/queryProductPage",
            f"{CONFIG['base_url']}/api/product/search"
        ]

        payload = {
            "pageShow": CONFIG['page_size'],
            "nowPage": page_num,
            "areaCode": CONFIG['area_code'],
            "provinceCode": CONFIG['area_code']
        }

        for api_url in api_urls:
            try:
                # BUGFIX: the old code built `payload` but never sent it -- it
                # issued two GET navigations to POST endpoints instead. Send a
                # real POST (a dict `data` is serialized as JSON by Playwright).
                response = await self.context.request.post(
                    api_url, data=payload, timeout=15000
                )
                if response.ok:
                    try:
                        body = await response.json()
                    except Exception:
                        continue  # non-JSON reply; try the next endpoint
                    products = self._products_from_api_body(body)
                    if products:
                        return products
            except Exception as e:
                print(f"  API尝试失败: {str(e)[:50]}")

        # Fallback: load the listing page and let the front-end call its API.
        print(f"  访问商品列表页面...")
        await self.page.goto(
            f"{CONFIG['base_url']}/product/list?areaCode={CONFIG['area_code']}",
            timeout=30000
        )
        await self.page.wait_for_load_state("networkidle")

        # Give the page's own API calls time to complete.
        await self.page.wait_for_timeout(3000)

        # Scroll to trigger lazy-loaded content.
        await self.page.evaluate("window.scrollBy(0, 1000)")
        await self.page.wait_for_timeout(1000)

        return await self.extract_products_from_page()

    async def extract_products_from_page(self):
        """Scrape product entries out of the currently loaded page.

        Tries structured DOM extraction first, falls back to raw text mining,
        then de-duplicates by product name prefix.
        """
        print("  🔍 提取商品数据...")

        # Strategy 1: structured DOM extraction via candidate selectors.
        products = await self.page.evaluate("""() => {
            const results = [];
            
            // 尝试多种选择器
            const selectors = [
                // 可能的商品卡片选择器
                '.product-item',
                '.goods-item',
                '.product-card',
                '.goods-card',
                '[class*="product"]',
                '[class*="goods"]',
                '[class*="item"]',
                // 表格形式
                'table tr',
                // 列表形式
                '.list-item',
                '.item'
            ];
            
            let items = [];
            for (const sel of selectors) {
                items = document.querySelectorAll(sel);
                if (items.length > 5 && items.length < 500) {
                    console.log('Found selector:', sel, 'with', items.length, 'items');
                    break;
                }
            }
            
            // 遍历每个商品元素
            items.forEach((item, index) => {
                try {
                    const html = item.innerHTML;
                    const text = item.innerText || '';
                    
                    // 提取价格
                    let price = '';
                    const priceMatch = text.match(/￥?\s*(\d+\.?\d*)/);
                    if (priceMatch) {
                        price = priceMatch[1];
                    }
                    
                    // 提取销量
                    let sales = '';
                    const salesMatch = text.match(/已售\s*(\d+)/);
                    if (salesMatch) {
                        sales = salesMatch[1];
                    }
                    
                    // 提取商品名称
                    let name = '';
                    const nameEl = item.querySelector('[class*="name"], [class*="title"], [class*="product"], h3, h4, .title, .product-name');
                    if (nameEl) {
                        name = nameEl.innerText.trim();
                    }
                    
                    // 如果没找到，用链接文本
                    if (!name) {
                        const link = item.querySelector('a');
                        if (link) {
                            name = link.innerText.trim();
                        }
                    }
                    
                    // 获取链接
                    let url = '';
                    const link = item.querySelector('a[href*="product"], a[href*="detail"], a[href*="goods"]');
                    if (link) {
                        url = link.href;
                    }
                    
                    if ((name || price) && name.length > 1) {
                        results.push({
                            name: name || '未知商品',
                            price: price || '',
                            sales: sales || '0',
                            url: url || ''
                        });
                    }
                } catch(e) {}
            });
            
            return results;
        }""")

        # Strategy 2: raw page-text mining when the DOM yields too little.
        if not products or len(products) < 3:
            print("  🔄 DOM提取数据较少，尝试其他方式...")
            products = await self.extract_from_page_text()

        # De-duplicate on the first 20 chars of the name; the broad selectors
        # above frequently match the same card more than once.
        unique_products = []
        seen = set()
        for p in products:
            key = p.get('name', '')[:20]
            if key and key not in seen:
                seen.add(key)
                unique_products.append(p)

        print(f"  ✅ 共提取到 {len(unique_products)} 条商品")
        return unique_products

    async def extract_from_page_text(self):
        """Mine product name/price/sales tuples out of the page's plain text."""
        return await self.page.evaluate("""() => {
            const results = [];
            
            // 获取所有文本内容进行分析
            const pricePattern = /([^\\n]{1,50}￥?\\s*\\d+\\.?\\d*[^\\n]{1,50})/g;
            const text = document.body.innerText;
            
            // 匹配商品名称 + 价格
            const matches = text.match(/([^\\n]{5,80}￥?\\s*\\d+\\.?\\d*)/g) || [];
            
            matches.forEach(match => {
                const cleanMatch = match.trim().replace(/\\s+/g, ' ');
                
                // 提取价格
                const priceMatch = cleanMatch.match(/￥?\\s*(\\d+\\.?\\d*)/);
                const price = priceMatch ? priceMatch[1] : '';
                
                // 提取销量
                const salesMatch = cleanMatch.match(/已售\\s*(\\d+)/);
                const sales = salesMatch ? salesMatch[1] : '0';
                
                // 清理名称
                let name = cleanMatch
                    .replace(/￥?\\s*\\d+\\.?\\d*/g, '')
                    .replace(/已售\\s*\\d+/g, '')
                    .replace(/\\d+元/g, '')
                    .trim();
                
                if (name.length > 3 && name.length < 100) {
                    results.push({
                        name: name,
                        price: price,
                        sales: sales,
                        url: ''
                    });
                }
            });
            
            return results;
        }""")

    async def get_supplier_info(self, product_url):
        """Visit a product detail page and extract supplier name + contact.

        Best-effort: returns {} on any failure (bad URL, navigation timeout,
        extraction error). Navigates self.page away from the current page.
        """
        if not product_url:
            return {}

        try:
            await self.page.goto(product_url, timeout=10000)
            await self.page.wait_for_load_state("networkidle")

            # BUGFIX: the old selector lists included Playwright-style
            # 'text="..."' entries, which are not valid CSS. When the CSS
            # selectors missed, document.querySelector() threw on them and the
            # whole evaluate() aborted, so supplier info was silently lost.
            # Only valid CSS selectors remain.
            supplier_info = await self.page.evaluate("""() => {
                const result = {
                    supplier: '',
                    contact: ''
                };

                const supplierSelectors = [
                    '[class*="supplier"]',
                    '[class*="vendor"]',
                    '[class*="shop"]'
                ];

                for (const sel of supplierSelectors) {
                    const el = document.querySelector(sel);
                    if (el) {
                        result.supplier = el.innerText.trim();
                        break;
                    }
                }

                const contactSelectors = [
                    '[class*="contact"]',
                    '[class*="phone"]',
                    '[class*="tel"]'
                ];

                for (const sel of contactSelectors) {
                    const el = document.querySelector(sel);
                    if (el) {
                        result.contact = el.innerText.trim();
                        break;
                    }
                }

                return result;
            }""")

            return supplier_info

        except Exception:
            # Supplier info is optional; never let one bad page kill the crawl.
            return {}

    async def crawl_all_pages(self):
        """Crawl up to CONFIG['max_pages'] listing pages.

        Appends to and returns self.all_products. Stops early when a page
        yields fewer than 3 products or pagination fails.
        """
        print("\n" + "="*60)
        print("  开始爬取河南商品数据")
        print("="*60)

        # Open the listing page filtered to the configured area.
        await self.page.goto(
            f"{CONFIG['base_url']}/product/list?areaCode={CONFIG['area_code']}",
            timeout=30000
        )
        await self.page.wait_for_load_state("networkidle")
        await self.page.wait_for_timeout(2000)

        for page in range(1, CONFIG['max_pages'] + 1):
            print(f"\n{'='*40}")
            print(f"  第 {page}/{CONFIG['max_pages']} 页")
            print(f"{'='*40}")

            # Scroll further on every iteration to trigger lazy loading.
            await self.page.evaluate(f"window.scrollBy(0, {page * 800})")
            await self.page.wait_for_timeout(1500)

            products = await self.extract_products_from_page()

            if not products or len(products) < 3:
                print("  ⚠️ 未找到更多商品，停止爬取")
                break

            # Enrich only the first few products with supplier details:
            # each lookup is a full page navigation and would be slow.
            for product in products[:5]:
                if product.get('url'):
                    supplier = await self.get_supplier_info(product['url'])
                    product['supplier'] = supplier.get('supplier', '')
                    product['contact'] = supplier.get('contact', '')

            self.all_products.extend(products)
            print(f"  📊 当前共 {len(self.all_products)} 条商品")

            # Paginate via the "next page" control when present.
            try:
                next_btn = await self.page.query_selector('button:has-text("下一页"), a:has-text("下一页"), .next')
                if next_btn:
                    await next_btn.click()
                    await self.page.wait_for_timeout(2000)
                else:
                    print("  ⚠️ 未找到下一页按钮")
                    break
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt / CancelledError.
            except Exception:
                print("  ⚠️ 翻页失败")
                break

        return self.all_products

    async def save_to_csv(self):
        """Write all collected products to CONFIG['output_file'] as CSV."""
        if not self.all_products:
            print("⚠️ 没有数据可保存")
            return

        output_path = CONFIG["output_file"]
        # One timestamp for the whole batch (was recomputed per row).
        scraped_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # utf-8-sig writes a BOM so Excel renders the Chinese headers correctly.
        with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
            writer.writeheader()

            for idx, product in enumerate(self.all_products, 1):
                writer.writerow({
                    "序号": idx,
                    "供应商名称": product.get('supplier', ''),
                    "商品名称": product.get('name', ''),
                    "单价(元)": product.get('price', ''),
                    "销量": product.get('sales', '0'),
                    "商品规格": product.get('spec', ''),
                    "商品链接": product.get('url', ''),
                    "抓取时间": scraped_at
                })

        print(f"\n✅ 数据已保存到: {output_path}")
        print(f"📊 共保存 {len(self.all_products)} 条记录")

        # Preview the first few rows for a quick sanity check.
        print("\n📋 前5条数据预览:")
        for i, p in enumerate(self.all_products[:5], 1):
            print(f"  {i}. {p.get('name', 'N/A')[:40]}")
            print(f"     💰 {p.get('price', 'N/A')}元 | 已售 {p.get('sales', '0')}")

    async def close(self):
        """Release the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            print("🔒 浏览器已关闭")
        # BUGFIX: the driver was never stopped, leaking its subprocess.
        if self.playwright:
            await self.playwright.stop()

async def main():
    """Script entry point: launch the spider, crawl, persist, then clean up."""
    banner = "=" * 60
    print(banner)
    print("  832平台 河南商品爬虫 v2.0")
    print("  作者: 李狗蛋")
    print(banner + "\n")

    spider = Spider832()
    try:
        await spider.init_browser()
        await spider.crawl_all_pages()   # gather every page of products
        await spider.save_to_csv()       # then write them out
        print("\n" + banner)
        print("  🎉 爬虫执行完成!")
        print(banner)
    except Exception as e:
        print(f"❌ 错误: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always release the browser, even after a failure.
        await spider.close()

if __name__ == "__main__":
    asyncio.run(main())
