#!/usr/bin/env python3
"""
832平台爬虫 - 河南地区商品数据抓取（无需登录版）
"""

import asyncio
import csv
import json
import re
from datetime import datetime
from playwright.async_api import async_playwright

# Scraper configuration: target region, output path, and site root.
CONFIG = {
    "area_code": "410000",  # administrative division code for Henan province
    "output_file": "henan_products.csv",
    "base_url": "https://ys.fupin832.com"
}

# Column headers for the CSV output (kept in Chinese — they are written to the file)
CSV_HEADERS = [
    "序号",
    "供应商名称",
    "商品名称",
    "单价(元)",
    "销量",
    "联系方式",
    "商品链接",
    "抓取时间"
]

class Spider832:
    """No-login explorer for the 832 platform (ys.fupin832.com).

    Drives a headless Chromium via Playwright to: probe likely JSON API
    endpoints directly, visit product listing pages, analyze the DOM
    structure, and passively capture any product-related API responses
    seen on the wire. Results are printed and screenshotted; nothing is
    written to CSV yet.
    """

    def __init__(self):
        # Playwright driver handle; kept so close() can stop it
        # (previously it was a local in init_browser and leaked).
        self._playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.products = []
        # Captured API responses. Initialized here (not in init_browser)
        # so the attribute always exists even before the browser starts.
        self.api_responses = []

    async def init_browser(self):
        """Launch headless Chromium, open a page, and attach a response sniffer."""
        print("🚀 启动浏览器...")
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        self.context = await self.browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        self.page = await self.context.new_page()

        async def handle_response(response):
            # Record anything that looks like a product/goods API call.
            url = response.url
            if '/api/' in url or 'product' in url.lower() or 'goods' in url.lower():
                try:
                    if response.status == 200:
                        body = await response.text()
                        self.api_responses.append({
                            'url': url,
                            'status': response.status,
                            'body': body[:5000]  # cap body size to bound memory
                        })
                except Exception:
                    # Best effort: some responses have no readable body
                    # (redirects, aborted requests) — skip them silently.
                    pass

        self.page.on('response', handle_response)
        print("✅ 浏览器启动成功")

    async def try_direct_api(self):
        """Probe a few guessed API endpoints; return raw page content on a JSON-looking hit, else None."""
        print("\n🌐 尝试直接访问API...")

        # Common product-listing API path guesses for this site.
        api_urls = [
            f"https://ys.fupin832.com/api/product/list?areaCode={CONFIG['area_code']}&page=1&pageSize=20",
            f"https://ys.fupin832.com/api/goods/list?areaCode={CONFIG['area_code']}&page=1&pageSize=20",
            f"https://ys.fupin832.com/product/api/list?areaCode={CONFIG['area_code']}&page=1&pageSize=20",
            f"https://ys.fupin832.com/api/supplier/product/list?provinceCode={CONFIG['area_code']}&page=1&pageSize=20",
        ]

        for url in api_urls:
            try:
                print(f"  尝试: {url[:60]}...")
                await self.page.goto(url, timeout=10000)
                content = await self.page.content()

                # Crude JSON heuristic: braces present in the rendered content.
                if '{' in content and '}' in content:
                    print(f"  ✅ 发现JSON数据!")
                    await self.page.screenshot(path="api_response.png")
                    return content
            except Exception as e:
                print(f"  ❌ 失败: {str(e)[:50]}")

        return None

    async def visit_product_page(self):
        """Try several listing-page entry URLs; return the first that mentions products, else None."""
        print("\n🛒 访问商品列表页面...")

        # Candidate entry points, most specific first.
        urls_to_try = [
            f"https://ys.fupin832.com/product/list?areaCode={CONFIG['area_code']}",
            f"https://ys.fupin832.com/product?areaCode={CONFIG['area_code']}",
            "https://ys.fupin832.com/product/list",
            "https://ys.fupin832.com/",
        ]

        for url in urls_to_try:
            try:
                print(f"  访问: {url}")
                await self.page.goto(url, timeout=30000)
                await self.page.wait_for_load_state("networkidle")

                # Screenshot with a filesystem-safe name derived from the URL.
                safe_name = url.replace('://', '_').replace('/', '_').replace('?', '_')[:50]
                await self.page.screenshot(path=f"page_{safe_name}.png")

                content = await self.page.content()
                print(f"  📄 页面大小: {len(content)} 字符")

                title = await self.page.title()
                print(f"  📌 页面标题: {title}")

                # Heuristic: page is useful if it mentions products at all.
                if '商品' in content or '产品' in content or 'product' in content.lower():
                    print(f"  ✅ 找到商品相关内容!")
                    return url

            except Exception as e:
                print(f"  ❌ 访问失败: {str(e)[:80]}")

        return None

    async def analyze_page_structure(self):
        """Run in-page JS to summarize DOM structure and likely product elements; return the summary dict."""
        print("\n🔍 分析页面结构...")

        # The JS below is executed in the browser; its output drives the
        # printed report. Kept verbatim from the working version.
        structure = await self.page.evaluate("""() => {
            const result = {
                forms: document.querySelectorAll('form').length,
                inputs: document.querySelectorAll('input').length,
                buttons: document.querySelectorAll('button').length,
                links: document.querySelectorAll('a').length,
                images: document.querySelectorAll('img').length,
                tables: document.querySelectorAll('table').length,
                lists: document.querySelectorAll('ul, ol').length,
                divs: document.querySelectorAll('div').length,
                
                // 查找可能的商品列表容器
                possibleProductContainers: [],
                
                // 查找包含价格的元素
                priceElements: [],
                
                // 获取所有链接
                productLinks: []
            };
            
            // 查找可能的商品容器
            const containerSelectors = [
                '[class*="product"]',
                '[class*="goods"]',
                '[class*="item"]',
                '[class*="list"]',
                '[class*="card"]'
            ];
            
            containerSelectors.forEach(sel => {
                const els = document.querySelectorAll(sel);
                if (els.length > 0 && els.length < 100) {
                    result.possibleProductContainers.push({
                        selector: sel,
                        count: els.length
                    });
                }
            });
            
            // 查找价格元素
            const priceRegex = /¥?\d+\.?\d*\s*元/;
            document.querySelectorAll('*').forEach(el => {
                const text = el.innerText || '';
                if (priceRegex.test(text) && text.length < 100) {
                    result.priceElements.push(text.trim());
                }
            });
            
            // 获取商品相关链接
            document.querySelectorAll('a').forEach(a => {
                const href = a.href || '';
                const text = a.innerText || '';
                if ((href.includes('product') || href.includes('goods') || href.includes('item') || href.includes('detail')) 
                    && text.length > 0 && text.length < 100) {
                    result.productLinks.push({
                        text: text.trim(),
                        href: href
                    });
                }
            });
            
            return result;
        }""")

        print(f"\n📊 页面结构分析:")
        print(f"  - 表单: {structure['forms']} 个")
        print(f"  - 输入框: {structure['inputs']} 个")
        print(f"  - 按钮: {structure['buttons']} 个")
        print(f"  - 链接: {structure['links']} 个")
        print(f"  - 图片: {structure['images']} 个")
        print(f"  - 表格: {structure['tables']} 个")

        if structure['possibleProductContainers']:
            print(f"\n📦 可能的商品容器:")
            for container in structure['possibleProductContainers'][:5]:
                print(f"  - {container['selector']}: {container['count']} 个")

        if structure['priceElements']:
            print(f"\n💰 找到的价格元素 (前10个):")
            for price in structure['priceElements'][:10]:
                print(f"  - {price}")

        if structure['productLinks']:
            print(f"\n🔗 找到的商品链接 (前10个):")
            for link in structure['productLinks'][:10]:
                print(f"  - {link['text'][:30]}: {link['href'][:60]}")

        return structure

    async def intercept_api_calls(self):
        """Click into the Henan region, scroll to trigger lazy loads, and report captured API responses."""
        print("\n📡 监听API请求...")

        try:
            # Candidate selectors for a "Henan" entry link/button.
            henan_selectors = [
                'text="河南"',
                'text="河南省"',
                '[data-code="410000"]',
                'a:has-text("河南")'
            ]

            for selector in henan_selectors:
                try:
                    await self.page.click(selector, timeout=3000)
                    print(f"  ✅ 点击了: {selector}")
                    await self.page.wait_for_timeout(2000)
                    break
                except Exception:
                    # Selector not present on this page — try the next one.
                    pass

        except Exception as e:
            print(f"  ⚠️ 无法点击河南: {e}")

        # Scroll a few times to trigger lazy-loaded content (and its API calls).
        for _ in range(3):
            await self.page.evaluate("window.scrollBy(0, 500)")
            await self.page.wait_for_timeout(1000)

        # Report whatever the response sniffer captured so far.
        if self.api_responses:
            print(f"\n📡 捕获到 {len(self.api_responses)} 个API响应:")
            for resp in self.api_responses[:5]:
                print(f"  - {resp['url'][:60]}")
                if resp['body']:
                    print(f"    数据预览: {resp['body'][:200]}")

        return self.api_responses

    async def save_screenshots(self):
        """Save a full-page screenshot of the current page."""
        await self.page.screenshot(path="final_page.png", full_page=True)
        print("📸 完整页面截图已保存: final_page.png")

    async def close(self):
        """Close the browser and stop the Playwright driver process."""
        if self.browser:
            await self.browser.close()
            print("🔒 浏览器已关闭")
        if self._playwright:
            # Stop the driver too; otherwise its subprocess outlives the script.
            await self._playwright.stop()
            self._playwright = None

async def main():
    """Run the exploration pipeline end to end: launch, probe, analyze, clean up."""
    print("="*60)
    print("  832平台 河南商品爬虫 (无登录版)")
    print("  作者: 李狗蛋")
    print("  时间: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    print("="*60 + "\n")

    spider = Spider832()

    try:
        # 1. Start the browser and attach the response sniffer.
        await spider.init_browser()

        # 2. Probe guessed API endpoints directly (output is informational;
        #    the previous unused binding of the result was dropped).
        await spider.try_direct_api()

        # 3. Find a reachable product listing page.
        page_url = await spider.visit_product_page()

        # 4-6. Only analyze further if some page actually loaded.
        if page_url:
            await spider.analyze_page_structure()
            await spider.intercept_api_calls()
            await spider.save_screenshots()

        print("\n" + "="*60)
        print("  初步探索完成!")
        print("  请查看截图和日志，我们再继续优化")
        print("="*60)

    except Exception as e:
        # Top-level boundary: report the failure with a traceback, then fall
        # through to cleanup.
        print(f"❌ 错误: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the browser, even after a failure.
        await spider.close()

if __name__ == "__main__":
    # Script entry point: run the async main coroutine to completion.
    asyncio.run(main())
