# WhereAmI/core/data_source.py

"""代理数据源获取模块"""
import requests
from bs4 import BeautifulSoup
import json
import os
from typing import List, Optional
from datetime import datetime
from loguru import logger
from .models import ProxyInfo, ProxyProtocol


class DataSource:
    """Fetch proxy lists from a web page or a local JSON file, and persist them.

    Every public method is best-effort: failures are logged through
    ``self.logger`` and surface as an empty result rather than an exception.
    """

    # Browser-like User-Agent sent with every page request.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Per-request timeout in seconds.
    _REQUEST_TIMEOUT = 30
    # A table row must have at least this many <td> cells to be parsed.
    _MIN_CELLS = 12

    def __init__(self, config: dict):
        """Store the configuration and bind the shared loguru logger.

        Args:
            config: Settings mapping; kept on ``self.config`` and not read by
                any method of this class.
        """
        self.config = config
        self.logger = logger

    def fetch_from_web(self, url: str, output_file: str = "proxy.json", pages: int = 1) -> List[ProxyInfo]:
        """Scrape proxies from the paginated pages of ``url`` and save them.

        Pages ``1..pages`` are requested one by one by appending a ``page``
        query parameter. A page that fails (non-200 status, request error or
        parse error) is logged and skipped. The combined result is
        de-duplicated on IP + port and handed to ``save_to_file`` only when
        at least one proxy was found.

        Args:
            url: Base URL without the page parameter.
            output_file: Path of the JSON file to write.
            pages: Number of pages to fetch, starting at page 1.

        Returns:
            The de-duplicated proxies in first-seen order; an empty list when
            nothing was fetched or an unexpected error interrupted the run.
        """
        self.logger.info(f"开始从网页获取代理: {url} (共{pages}页)")
        all_proxies = []
        # Bound before the try block so the final return is always valid,
        # even when an error is caught before de-duplication runs.
        unique_proxies = []

        try:
            for page in range(1, pages + 1):
                page_url = self._build_page_url(url, page)
                self.logger.info(f"正在获取第 {page}/{pages} 页: {page_url}")

                try:
                    response = requests.get(
                        page_url,
                        headers=self._REQUEST_HEADERS,
                        timeout=self._REQUEST_TIMEOUT,
                    )
                    # Force UTF-8 decoding of the body before reading .text.
                    response.encoding = 'utf-8'

                    if response.status_code != 200:
                        self.logger.warning(f"第{page}页获取失败，状态码: {response.status_code}")
                        continue

                    proxies = self._parse_html(response.text)
                    self.logger.info(f"第{page}页成功获取 {len(proxies)} 个代理")
                    all_proxies.extend(proxies)

                except Exception as e:
                    # One bad page must not abort the remaining pages.
                    self.logger.error(f"第{page}页获取失败: {str(e)}")
                    continue

            unique_proxies = self._deduplicate(all_proxies)
            self.logger.info(f"成功获取 {len(unique_proxies)} 个唯一代理（共{len(all_proxies)}个，去重后）")

            if unique_proxies:
                self.save_to_file(unique_proxies, output_file)
                self.logger.info(f"代理已保存到: {output_file}")

        except Exception as e:
            self.logger.error(f"获取网页代理失败: {str(e)}")

        return unique_proxies

    @staticmethod
    def _build_page_url(url: str, page: int) -> str:
        """Append ``page=<page>`` to ``url`` with the fitting ``?``/``&`` separator."""
        separator = '&' if '?' in url else '?'
        return f"{url}{separator}page={page}"

    @staticmethod
    def _deduplicate(proxies):
        """Return ``proxies`` without repeated IP + port pairs, keeping first-seen order."""
        seen = set()
        unique = []
        for proxy in proxies:
            key = f"{proxy.ip_address}:{proxy.port}"
            if key not in seen:
                seen.add(key)
                unique.append(proxy)
        return unique

    @staticmethod
    def _hidden_text(cell) -> str:
        """Read the ``data-text`` attribute of the cell's <img>, or ``'no need'`` if absent."""
        img = cell.find('img')
        return img.get('data-text', 'no need') if img else 'no need'

    def _parse_html(self, html: str) -> List[ProxyInfo]:
        """Extract proxies from the proxy table of an HTML page.

        The page is expected to follow the ``page.html`` layout: a
        ``<table class="table">`` whose ``<tbody>`` rows hold at least 12
        cells, in this order: IP, port, username, password, country,
        protocol, anonymity, speed, uptime, response time, latency and last
        update. Rows that are too short, lack an IP or port, have a
        non-numeric port, or raise while being parsed are skipped.

        Args:
            html: Raw HTML text of one listing page.

        Returns:
            The proxies parsed from the table, in row order; an empty list
            when the table or its body is missing.
        """
        proxies = []
        soup = BeautifulSoup(html, 'html.parser')

        table = soup.find('table', class_='table')
        if not table:
            self.logger.warning("未找到代理表格")
            return proxies

        tbody = table.find('tbody')
        if not tbody:
            return proxies

        # Built once per call instead of once per row.
        protocol_map = {
            'http': ProxyProtocol.HTTP,
            'https': ProxyProtocol.HTTPS,
            'socks4': ProxyProtocol.SOCKS4,
            'socks5': ProxyProtocol.SOCKS5
        }

        for row in tbody.find_all('tr'):
            try:
                cells = row.find_all('td')
                if len(cells) < self._MIN_CELLS:
                    continue

                ip_address = cells[0].text.strip()
                port_text = cells[1].text.strip()
                if not ip_address or not port_text:
                    continue

                try:
                    port = int(port_text)
                except ValueError:
                    continue

                # The country name is carried by the link's title attribute.
                country_link = cells[4].find('a')
                country = country_link.get('title', '') if country_link else ''

                # Unrecognised protocol labels fall back to SOCKS5.
                protocol_text = cells[5].text.strip().lower()
                protocol = protocol_map.get(protocol_text, ProxyProtocol.SOCKS5)

                proxies.append(ProxyInfo(
                    ip_address=ip_address,
                    port=port,
                    # Credentials are hidden in the data-text attribute of an <img>.
                    username=self._hidden_text(cells[2]),
                    password=self._hidden_text(cells[3]),
                    protocol=protocol,
                    country=country,
                    anonymity=cells[6].text.strip(),
                    speed=cells[7].text.strip(),
                    uptime_percentage=cells[8].text.strip(),
                    response_time=cells[9].text.strip(),
                    latency=cells[10].text.strip(),
                    last_updated=cells[11].text.strip()
                ))

            except Exception as e:
                # A malformed row is skipped; the rest of the table is still parsed.
                self.logger.debug(f"解析代理行失败: {str(e)}")
                continue

        return proxies

    def load_from_file(self, filepath: str) -> List[ProxyInfo]:
        """Load proxies from a local JSON file.

        The file must contain a JSON array; each element is converted with
        ``ProxyInfo.from_dict``. Elements that fail to convert are skipped
        with a debug log entry.

        Args:
            filepath: Path of the JSON file to read.

        Returns:
            The proxies that could be loaded; an empty list when the file is
            missing, unreadable, not valid JSON, or not a JSON array.
        """
        self.logger.info(f"从本地文件加载代理: {filepath}")
        proxies = []

        if not os.path.exists(filepath):
            self.logger.warning(f"文件不存在: {filepath}")
            return proxies

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, list):
                # Report the unexpected shape instead of a misleading
                # "loaded 0 proxies" success message.
                self.logger.warning(f"文件内容不是代理列表: {filepath}")
                return proxies

            for item in data:
                try:
                    proxies.append(ProxyInfo.from_dict(item))
                except Exception as e:
                    self.logger.debug(f"解析代理项失败: {str(e)}")
                    continue

            self.logger.info(f"成功加载 {len(proxies)} 个代理")

        except Exception as e:
            self.logger.error(f"加载本地文件失败: {str(e)}")

        return proxies

    def save_to_file(self, proxies: List[ProxyInfo], filepath: str):
        """Write the proxies to ``filepath`` as an indented UTF-8 JSON array.

        Each proxy is serialised through its ``to_dict`` method. Any failure
        is logged and swallowed, so this method never raises.

        Args:
            proxies: Proxies to serialise.
            filepath: Destination path; an existing file is overwritten.
        """
        try:
            data = [proxy.to_dict() for proxy in proxies]
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            self.logger.info(f"保存 {len(proxies)} 个代理到文件: {filepath}")
        except Exception as e:
            self.logger.error(f"保存文件失败: {str(e)}")