# PyBot/media/sougou-wx.py

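"""Scrape WeChat Official Account article listings from Sogou's WeChat search
(https://weixin.sogou.com) for a set of keywords and save the parsed results
to ./JSON/sougou-wx.json."""
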
import requests
from bs4 import BeautifulSoup
import json
import time
import os
import datetime
from requests.exceptions import RequestException
from loguru import logger
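
# Browser-like request headers: Sogou's WeChat search tends to block requests
# that do not resemble a real browser, so a full Firefox header set is sent.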
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
    "Te": "trailers",
    "Connection": "keep-alive"
}


def fetch_html(url, headers=headers, timeout=10):
    """Fetch a page and return its HTML, or None on request failure."""
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text
    except RequestException as e:
        logger.error(f"Request failed: {e}")
        return None


def parse_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Each search result is an <li> whose id starts with Sogou's result prefix
    items = soup.find_all('li', id=lambda x: x and x.startswith('sogou_vr_11002601_box_'))
    results = []
    for item in items:
        # Extract the title and link
        title_tag = item.find('h3')
        a_tag = title_tag.find('a') if title_tag else None
        title = title_tag.get_text(strip=True) if title_tag else "No title found"
        link = a_tag['href'] if a_tag else "No link found"
        # Sogou returns relative redirect URLs; make them absolute
        if a_tag and not link.startswith('http'):
            link = "https://weixin.sogou.com" + link
        # Extract the summary
        summary_tag = item.find('p', class_='txt-info')
        summary = summary_tag.get_text(strip=True) if summary_tag else "No summary found"
        # Extract the publisher
        publisher_tag = item.find('span', class_='all-time-y2')
        publisher = publisher_tag.get_text(strip=True) if publisher_tag else "No publisher found"
        # Extract the Unix timestamp embedded in an inline <script> and
        # convert it to a standard datetime string
        timestamp_script = item.find(
            'script',
            string=lambda text: text and 'document.write(timeConvert' in text
        )
        if timestamp_script:
            timestamp_str = timestamp_script.string.split("'")[1]
            timestamp = int(timestamp_str)
            standard_time = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
        else:
            standard_time = "No timestamp found"
        results.append({
            "title": title,
            "link": link,
            "description": summary,
            "author": publisher,
            "pubDate": standard_time
        })
    return results


def remove_surrogates(text):
    """Strip invalid surrogate pairs that cannot be encoded as UTF-8."""
    return text.encode('utf-8', 'ignore').decode('utf-8')


def sougou_wx_main(keywords):
    all_results = {}  # Accumulates results for every keyword
    for keyword in keywords:
        url = f"https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8&query={keyword}"
        html_content = fetch_html(url)
        # print(html_content)
        if html_content is None:
            logger.warning(f"Could not fetch Sogou-WeChat search results; skipping save. Keyword: {keyword}")
            continue
        results = parse_html(html_content)
        # Strip invalid surrogate pairs from every field
        cleaned_results = [{k: remove_surrogates(v) for k, v in item.items()} for item in results]
        logger.info(f"Sogou-WeChat search results for keyword [{keyword}] fetched and parsed.")
        all_results[keyword] = cleaned_results  # Store results keyed by keyword
        time.sleep(5)  # Throttle requests to avoid Sogou's rate limiting
    # Serialize all results to JSON
    json_results = json.dumps(all_results, ensure_ascii=False, indent=4)
    # print(json_results)
    # Ensure the output directory exists
    os.makedirs(os.path.dirname('./JSON/sougou-wx.json'), exist_ok=True)
    # Write the parsed data to the JSON file
    with open('./JSON/sougou-wx.json', 'w', encoding='utf-8') as f:
        f.write(json_results)


if __name__ == "__main__":
    keywords = ["齐鲁银行", "APP逆向", "渗透测试"]
    sougou_wx_main(keywords)
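
# Illustrative output shape (field values below are hypothetical); the real
# ./JSON/sougou-wx.json maps each keyword to a list of result dicts:
# {
#     "渗透测试": [
#         {
#             "title": "...",
#             "link": "https://weixin.sogou.com/link?url=...",
#             "description": "...",
#             "author": "...",
#             "pubDate": "2024-12-30 21:31:54"
#         }
#     ]
# }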