import requests
from bs4 import BeautifulSoup
import json
import time
import os
import datetime
from requests.exceptions import RequestException
from loguru import logger
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Accept-Encoding": "gzip, deflate, br",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Priority": "u=0, i",
"Te": "trailers",
"Connection": "keep-alive"
}
def fetch_html(url, headers=headers, timeout=10):
try:
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
return response.text
except requests.Timeout:
logger.warning(f"请求 {url} 超时,跳过保存操作。")
except requests.exceptions.RequestException as e:
logger.error(f"请求 {url} 时发生错误: {e}")
def parse_html(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
# 提取所有符合条件的
标签
items = soup.find_all('li', id=lambda x: x and x.startswith('sogou_vr_11002601_box_'))
results = []
for item in items:
# 提取标题和链接
title_tag = item.find('h3')
if title_tag:
a_tag = title_tag.find('a')
title = title_tag.get_text(strip=True) if title_tag else "No title found"
link = a_tag['href'] if a_tag else "No link found"
if link and not link.startswith('http'):
link = "https://weixin.sogou.com" + link
else:
title = "No title found"
link = "No link found"
# 提取摘要
summary_tag = item.find('p', class_='txt-info')
summary = summary_tag.get_text(strip=True) if summary_tag else "No summary found"
# 提取发布者
publisher_tag = item.find('span', class_='all-time-y2')
publisher = publisher_tag.get_text(strip=True) if publisher_tag else "No publisher found"
# 提取时间戳并转换为标准时间格式
timestamp_script = item.find('script', string=lambda text: 'document.write(timeConvert' in text)
if timestamp_script:
timestamp_str = timestamp_script.string.split("'")[1]
timestamp = int(timestamp_str)
standard_time = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
else:
standard_time = "No timestamp found"
results.append({
"title": title,
"link": link,
"description": summary,
"author": publisher,
"pubDate": standard_time
})
return results
def remove_surrogates(text):
"""移除非法代理对"""
return text.encode('utf-8', 'ignore').decode('utf-8')
def sougou_wx_main(keywords):
all_results = {} # 用于存储所有关键词的结果
for keyword in keywords:
url = f"https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8&query={keyword}"
html_content = fetch_html(url)
# print(html_content)
if html_content is None:
logger.warning(f"无法获取微信公众号-Sogou搜索内容,跳过保存操作。关键词: {keyword}")
continue
results = parse_html(html_content)
# 移除非法代理对
cleaned_results = [{k: remove_surrogates(v) for k, v in item.items()} for item in results]
logger.warning(f"关键词【{keyword}】的微信公众号-Sogou搜索内容保存成功。")
all_results[keyword] = cleaned_results # 将结果存储在字典中,以关键词为键
time.sleep(5)
# 将所有结果转换为JSON格式
json_results = json.dumps(all_results, ensure_ascii=False, indent=4)
# print(json_results)
# 确保目录存在
os.makedirs(os.path.dirname('./JSON/sougou-wx.json'), exist_ok=True)
# 将解析后的数据保存到 JSON 文件
with open('./JSON/sougou-wx.json', 'w', encoding='utf-8') as f:
f.write(json_results)
if __name__ == "__main__":
keywords = ["齐鲁银行", "APP逆向", "渗透测试"]
sougou_wx_main(keywords)