"""Scrape WeChat article search results from weixin.sogou.com into a JSON file."""

import datetime
import json
import os
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from loguru import logger
from requests.exceptions import RequestException

# Browser-like request headers sent with every search request.
# NOTE(review): "br" is advertised in Accept-Encoding; presumably the runtime
# has a brotli decoder available to requests/urllib3 -- confirm, otherwise a
# brotli-compressed response body would not be decoded.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
    "Te": "trailers",
    "Connection": "keep-alive"
}

# Site root used to turn relative result links into absolute URLs.
_BASE_URL = "https://weixin.sogou.com"
# Every search-result <li> carries an id starting with this prefix.
_RESULT_ID_PREFIX = 'sogou_vr_11002601_box_'
# Inline script snippet that carries the publication timestamp of a result.
_TIME_MARKER = 'document.write(timeConvert'

_NO_TITLE = "No title found"
_NO_LINK = "No link found"
_NO_TIMESTAMP = "No timestamp found"


def fetch_html(url, headers=headers, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        headers: Request headers; defaults to the module-level ``headers``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The response text, or ``None`` when the request times out, fails, or
        the server answers with an HTTP error status (the failure is logged).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.Timeout:
        logger.warning(f"请求 {url} 超时,跳过保存操作。")
    except RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
    return None


def _extract_title_and_link(item):
    """Return ``(title, link)`` for one result element, with placeholders."""
    title_tag = item.find('h3')
    if title_tag is None:
        return _NO_TITLE, _NO_LINK

    title = title_tag.get_text(strip=True)
    a_tag = title_tag.find('a')
    # .get() avoids a KeyError when the anchor has no href attribute.
    href = a_tag.get('href') if a_tag is not None else None
    if not href:
        # Keep the placeholder untouched; it must not get the site prefix.
        return title, _NO_LINK
    if not href.startswith('http'):
        href = _BASE_URL + href
    return title, href


def _extract_pub_date(item):
    """Return the result's publication time as ``YYYY-mm-dd HH:MM:SS`` text.

    The timestamp is read from the first single-quoted argument of the inline
    ``document.write(timeConvert('...'))`` script and converted with
    ``datetime.fromtimestamp`` (local time). A placeholder string is returned
    when the script is missing or its content cannot be parsed.
    """
    # The filter receives None for <script> tags without a single string
    # child, so guard before the substring test.
    script = item.find(
        'script',
        string=lambda text: bool(text) and _TIME_MARKER in text,
    )
    if script is None or not script.string:
        return _NO_TIMESTAMP
    try:
        timestamp = int(script.string.split("'")[1])
        moment = datetime.datetime.fromtimestamp(timestamp)
    except (IndexError, ValueError, OverflowError, OSError):
        # Malformed or out-of-range timestamp: degrade to the placeholder
        # instead of aborting the whole page parse.
        return _NO_TIMESTAMP
    return moment.strftime('%Y-%m-%d %H:%M:%S')


def _text_or_default(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    return tag.get_text(strip=True) if tag is not None else default


def parse_html(html_content):
    """Parse a Sogou WeChat search result page.

    Args:
        html_content: HTML text of the result page.

    Returns:
        A list with one dict per result element, holding the string fields
        ``title``, ``link``, ``description``, ``author`` and ``pubDate``.
        Fields that cannot be found are filled with a "No ... found"
        placeholder.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Select every <li> element whose id marks it as a search result.
    items = soup.find_all(
        'li',
        id=lambda x: x and x.startswith(_RESULT_ID_PREFIX),
    )

    results = []
    for item in items:
        title, link = _extract_title_and_link(item)
        results.append({
            "title": title,
            "link": link,
            "description": _text_or_default(
                item.find('p', class_='txt-info'), "No summary found"
            ),
            "author": _text_or_default(
                item.find('span', class_='all-time-y2'), "No publisher found"
            ),
            "pubDate": _extract_pub_date(item),
        })
    return results


def remove_surrogates(text):
    """Return *text* without code points that cannot be encoded as UTF-8.

    Lone surrogates are dropped by the ``'ignore'`` error handler during the
    encode/decode round trip.
    """
    return text.encode('utf-8', 'ignore').decode('utf-8')


def sougou_wx_main(keywords, output_path='./JSON/sougou-wx.json', delay=5):
    """Search Sogou WeChat for each keyword and save all results as JSON.

    Args:
        keywords: Iterable of search keyword strings.
        output_path: Destination JSON file; its directory is created when
            missing.
        delay: Seconds to sleep after each successfully processed keyword.

    The output file maps each keyword to its list of result dicts. Keywords
    whose page could not be fetched are logged and left out of the file.
    """
    all_results = {}  # keyword -> list of cleaned result dicts
    for keyword in keywords:
        # Percent-encode the keyword so characters such as '&', '#', '+' or
        # spaces cannot alter the query string.
        url = (
            "https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8"
            f"&query={quote(keyword, safe='')}"
        )
        html_content = fetch_html(url)
        if html_content is None:
            logger.warning(f"无法获取微信公众号-Sogou搜索内容,跳过保存操作。关键词: {keyword}")
            continue

        results = parse_html(html_content)
        # Strip unencodable code points so the UTF-8 file write cannot fail.
        cleaned_results = [
            {k: remove_surrogates(v) for k, v in item.items()}
            for item in results
        ]
        # NOTE(review): a success message logged at WARNING level, before the
        # file is actually written -- kept as-is; confirm whether intended.
        logger.warning(f"关键词【{keyword}】的微信公众号-Sogou搜索内容保存成功。")
        all_results[keyword] = cleaned_results
        time.sleep(delay)

    json_results = json.dumps(all_results, ensure_ascii=False, indent=4)

    # os.makedirs('') raises, so only create a directory when the path has one.
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json_results)


if __name__ == "__main__":
    keywords = ["齐鲁银行", "APP逆向", "渗透测试"]
    sougou_wx_main(keywords)