118 lines
4.3 KiB
Python
118 lines
4.3 KiB
Python
import requests
|
||
from bs4 import BeautifulSoup
|
||
import json
|
||
import time
|
||
import os
|
||
import datetime
|
||
from requests.exceptions import RequestException
|
||
from loguru import logger
|
||
|
||
# Default request headers; fetch_html() uses this dict when the caller does not
# supply its own. The values imitate a desktop Firefox browser.
headers = {
    # Client identity and content negotiation.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    # Navigation metadata a browser attaches to a top-level page load.
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    # Transport-level hints.
    "Priority": "u=0, i",
    "Te": "trailers",
    "Connection": "keep-alive",
}
|
||
|
||
def fetch_html(url, headers=headers, timeout=10):
    """Download ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: Request headers; defaults to the module-level ``headers`` dict.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they share the failure path.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Report through the module's logger (not print) so failures end up in
        # the same sink as the other messages this module emits.
        logger.error(f"请求出错: {e}")
        return None
    return response.text
|
||
|
||
def _absolute_link(a_tag):
    """Return the article URL held by a title anchor, or a placeholder if absent."""
    href = a_tag.get('href') if a_tag is not None else None
    if not href:
        # Return the placeholder untouched; it must never get the site prefix.
        return "No link found"
    if href.startswith('http'):
        return href
    # Site-relative links are resolved against the Sogou WeChat host.
    return "https://weixin.sogou.com" + href


def _format_publish_time(script_text):
    """Format the epoch found between the first pair of single quotes as local time."""
    try:
        epoch = int(script_text.split("'")[1])
        return datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S')
    except (IndexError, ValueError, OverflowError, OSError):
        # Unexpected script shape or out-of-range value: degrade to the
        # placeholder instead of aborting the whole page.
        return "No timestamp found"


def parse_html(html_content):
    """Extract article entries from a Sogou WeChat search result page.

    Args:
        html_content: HTML text of the result page.

    Returns:
        A list with one dict per ``<li>`` whose id starts with
        ``sogou_vr_11002601_box_``. Each dict has the string keys ``title``,
        ``link``, ``description``, ``author`` and ``pubDate``; a field that
        cannot be found holds a "No ... found" placeholder.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect every <li> result container.
    items = soup.find_all('li', id=lambda x: x and x.startswith('sogou_vr_11002601_box_'))

    results = []
    for item in items:
        # Title and link both live under the <h3> heading.
        title_tag = item.find('h3')
        if title_tag is not None:
            title = title_tag.get_text(strip=True)
            link = _absolute_link(title_tag.find('a'))
        else:
            title = "No title found"
            link = "No link found"

        # Summary paragraph.
        summary_tag = item.find('p', class_='txt-info')
        summary = summary_tag.get_text(strip=True) if summary_tag is not None else "No summary found"

        # Publisher name.
        publisher_tag = item.find('span', class_='all-time-y2')
        publisher = publisher_tag.get_text(strip=True) if publisher_tag is not None else "No publisher found"

        # The publish time is embedded in an inline script that calls
        # timeConvert('<epoch>'). The filter may be handed None for <script>
        # tags without a single string child, hence the truthiness guard.
        timestamp_script = item.find(
            'script',
            string=lambda text: bool(text) and 'document.write(timeConvert' in text,
        )
        if timestamp_script is not None:
            standard_time = _format_publish_time(timestamp_script.string)
        else:
            standard_time = "No timestamp found"

        results.append({
            "title": title,
            "link": link,
            "description": summary,
            "author": publisher,
            "pubDate": standard_time,
        })

    return results
|
||
|
||
def remove_surrogates(text):
    """Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates).

    The string is encoded with errors ignored, which discards the offending
    code points, and the surviving bytes are decoded back into a string.
    """
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')
|
||
|
||
def sougou_wx_main(keywords, output_path='./JSON/sougou-wx.json', delay=5):
    """Search Sogou WeChat for each keyword and save all results as one JSON file.

    Args:
        keywords: Iterable of search terms.
        output_path: Destination JSON file; parent directories are created
            when missing.
        delay: Seconds to wait between two consecutive requests.

    The file maps each keyword to its list of result dicts. Keywords whose
    page could not be fetched are logged and left out of the file.
    """
    # Imported locally so this function does not depend on a module-level import.
    from urllib.parse import quote

    all_results = {}  # keyword -> list of cleaned result dicts

    for index, keyword in enumerate(keywords):
        # Throttle between requests. Waiting before every request but the
        # first keeps the pause after failed fetches too and avoids a useless
        # wait once the last keyword is done.
        if index and delay > 0:
            time.sleep(delay)

        # Percent-encode the keyword so characters such as '&', '#', '+' or
        # spaces cannot break or truncate the query string.
        url = (
            "https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8"
            f"&query={quote(str(keyword), safe='')}"
        )
        html_content = fetch_html(url)

        if html_content is None:
            logger.warning(f"无法获取微信公众号-Sogou搜索内容,跳过保存操作。关键词: {keyword}")
            continue

        results = parse_html(html_content)
        # Strip unencodable characters so the UTF-8 file write cannot fail.
        all_results[keyword] = [
            {k: remove_surrogates(v) for k, v in item.items()} for item in results
        ]
        # Success is informational, not a warning.
        logger.info(f"关键词【{keyword}】的微信公众号-Sogou搜索内容保存成功。")

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated file behind.
    json_results = json.dumps(all_results, ensure_ascii=False, indent=4)

    # Make sure the target directory exists; a bare file name has none.
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json_results)
|
||
|
||
if __name__ == "__main__":
    # Search terms used when the module is executed as a script.
    keywords = [
        "齐鲁银行",
        "APP逆向",
        "渗透测试",
    ]
    sougou_wx_main(keywords)