PyBot/spider/sougou_wx.py

119 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
from bs4 import BeautifulSoup
import json
import time
import os
import datetime
from requests.exceptions import RequestException
from loguru import logger
# Default HTTP request headers, modelled on a desktop Firefox browser, used by
# fetch_html() when the caller does not supply its own.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    # "br" is deliberately not advertised: requests only decodes Brotli when an
    # optional brotli package is installed. Without it a Brotli-compressed
    # reply would reach the HTML parser as undecoded bytes.
    "Accept-Encoding": "gzip, deflate",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Priority": "u=0, i",
    "Te": "trailers",
    "Connection": "keep-alive",
}
def fetch_html(url, headers=headers, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: HTTP headers to send; defaults to the module-level ``headers``.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The response text on success. ``None`` when the request timed out,
        raised any other ``requests`` exception, or returned an HTTP error
        status; the failure is logged instead of being raised.
    """
    body = None
    try:
        reply = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are logged below.
        reply.raise_for_status()
        body = reply.text
    except requests.Timeout:
        logger.warning(f"请求 {url} 超时,跳过保存操作。")
    except requests.exceptions.RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
    return body
def _extract_title_and_link(item):
    """Return ``(title, link)`` for one result ``<li>``, with placeholders when absent."""
    title_tag = item.find('h3')
    if title_tag is None:
        return "No title found", "No link found"

    title = title_tag.get_text(strip=True)
    a_tag = title_tag.find('a')
    # .get() avoids a KeyError when the anchor carries no href attribute.
    href = a_tag.get('href') if a_tag is not None else None
    if not href:
        # Keep the placeholder as-is; it must not be prefixed like a relative URL.
        return title, "No link found"
    if not href.startswith('http'):
        # Relative links are resolved against the Sogou WeChat host.
        href = "https://weixin.sogou.com" + href
    return title, href


def _extract_pub_date(item):
    """Return the publish time of one result as ``YYYY-MM-DD HH:MM:SS`` or a placeholder."""
    # The guard on ``text`` matters: a <script> without a single string child
    # is passed to the filter as None.
    timestamp_script = item.find(
        'script',
        string=lambda text: bool(text) and 'document.write(timeConvert' in text,
    )
    if timestamp_script is None:
        return "No timestamp found"
    try:
        # The epoch seconds are the first single-quoted token in the script text.
        timestamp = int(timestamp_script.string.split("'")[1])
        return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    except (IndexError, ValueError, OverflowError, OSError):
        # Malformed script content must not abort parsing of the other results.
        return "No timestamp found"


def parse_html(html_content):
    """Parse a Sogou WeChat search result page into a list of article records.

    Args:
        html_content: HTML text of the search result page.

    Returns:
        A list with one dict per ``<li>`` whose id starts with
        ``sogou_vr_11002601_box_``. Each dict has the string keys ``title``,
        ``link``, ``description``, ``author`` and ``pubDate``; a field that
        cannot be found holds a ``"No ... found"`` placeholder.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Collect every <li> whose id marks it as a search result entry.
    items = soup.find_all('li', id=lambda x: x and x.startswith('sogou_vr_11002601_box_'))
    results = []
    for item in items:
        title, link = _extract_title_and_link(item)

        # Summary text
        summary_tag = item.find('p', class_='txt-info')
        summary = summary_tag.get_text(strip=True) if summary_tag else "No summary found"

        # Publisher name
        publisher_tag = item.find('span', class_='all-time-y2')
        publisher = publisher_tag.get_text(strip=True) if publisher_tag else "No publisher found"

        results.append({
            "title": title,
            "link": link,
            "description": summary,
            "author": publisher,
            "pubDate": _extract_pub_date(item),
        })
    return results
def remove_surrogates(text):
    """Return ``text`` with every character that cannot be encoded as UTF-8 dropped.

    In practice this strips lone surrogate code points, which would otherwise
    make writing the string to a UTF-8 file fail.
    """
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')
def sougou_wx_main(keywords):
    """Search Sogou WeChat for each keyword and save all results as JSON.

    For every keyword the search page is fetched with ``fetch_html``, parsed
    with ``parse_html`` and cleaned with ``remove_surrogates``. Keywords whose
    page cannot be fetched are logged and left out. The collected results are
    written to ``./JSON/sougou-wx.json`` as an object mapping each keyword to
    its list of result records.

    Args:
        keywords: Iterable of search keywords.
    """
    # Imported here so the function is self-contained with respect to the
    # file's import block.
    from urllib.parse import quote

    output_path = './JSON/sougou-wx.json'
    all_results = {}  # Results of every keyword, keyed by keyword
    for index, keyword in enumerate(keywords):
        if index:
            # Pause between consecutive requests, including after a failed one,
            # and skip the pointless wait after the last keyword.
            time.sleep(5)
        # Percent-encode the keyword so characters such as '&', '#' or '+'
        # cannot corrupt the query string.
        url = (
            "https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8"
            f"&query={quote(str(keyword), safe='')}"
        )
        html_content = fetch_html(url)
        if html_content is None:
            logger.warning(f"无法获取微信公众号-Sogou搜索内容跳过保存操作。关键词: {keyword}")
            continue
        results = parse_html(html_content)
        # Strip characters that cannot be written as UTF-8
        cleaned_results = [
            {k: remove_surrogates(v) for k, v in item.items()} for item in results
        ]
        logger.warning(f"关键词【{keyword}】的微信公众号-Sogou搜索内容保存成功。")
        all_results[keyword] = cleaned_results

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated output file behind.
    json_results = json.dumps(all_results, ensure_ascii=False, indent=4)
    # Make sure the target directory exists
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json_results)
if __name__ == "__main__":
    # Script entry point: run the crawler once for a fixed set of sample keywords.
    sougou_wx_main(["齐鲁银行", "APP逆向", "渗透测试"])