2024-12-30 21:31:54 +08:00
|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import os
|
|
|
|
|
import datetime
|
|
|
|
|
from requests.exceptions import RequestException
|
|
|
|
|
from loguru import logger
|
|
|
|
|
|
|
|
|
|
# Default request headers sent with every fetch; they imitate a desktop
# Firefox 133 browser navigating to a page directly.
headers = {
    # Browser identity and content negotiation.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    # NOTE(review): advertising "br" presumably requires a brotli decoder to
    # be installed alongside the HTTP client -- confirm in the deployment.
    "Accept-Encoding": "gzip, deflate, br",
    "Upgrade-Insecure-Requests": "1",
    # Fetch metadata describing a user-initiated top-level navigation.
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    # Transport-level hints.
    "Priority": "u=0, i",
    "Te": "trailers",
    "Connection": "keep-alive",
}
|
|
|
|
|
|
|
|
|
|
def fetch_html(url, headers=headers, timeout=10):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        headers: HTTP headers to send; defaults to the module-level
            ``headers`` mapping.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        ``response.text`` on success, or ``None`` when the request times
        out, fails, or the server answers with an HTTP error status
        (``raise_for_status``). Failures are logged, never raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.Timeout:
        # Timeouts are expected occasionally, so they are only a warning.
        logger.warning(f"请求 {url} 超时,跳过保存操作。")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
        return None
    else:
        return response.text
|
2024-12-30 21:31:54 +08:00
|
|
|
|
|
|
|
|
|
def _text_or_default(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is missing."""
    return tag.get_text(strip=True) if tag is not None else default


def _extract_title_and_link(item):
    """Return ``(title, link)`` for one result ``<li>``, using placeholders when absent."""
    title_tag = item.find('h3')
    if title_tag is None:
        return "No title found", "No link found"

    title = title_tag.get_text(strip=True)
    a_tag = title_tag.find('a')
    # .get() instead of ['href'] so an anchor without href cannot raise KeyError.
    href = a_tag.get('href') if a_tag is not None else None
    if href is None:
        # Return before the prefixing step: the placeholder must never be
        # turned into "https://weixin.sogou.comNo link found".
        return title, "No link found"
    if href and not href.startswith('http'):
        # Relative result links are resolved against the Sogou WeChat host.
        href = "https://weixin.sogou.com" + href
    return title, href


def _extract_pub_time(item):
    """Return the publish time as ``YYYY-MM-DD HH:MM:SS`` or a placeholder string."""
    # Scripts without a single text child are offered to the filter as None,
    # so the filter has to guard before using ``in``.
    script = item.find(
        'script',
        string=lambda text: bool(text) and 'document.write(timeConvert' in text,
    )
    if script is None:
        return "No timestamp found"
    try:
        # The epoch seconds are the first single-quoted token in the script text.
        timestamp = int(script.string.split("'")[1])
        # fromtimestamp() without tz renders the value in the local timezone.
        return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    except (IndexError, ValueError, OverflowError, OSError):
        # Unexpected script content must not abort parsing of the whole page.
        return "No timestamp found"


def parse_html(html_content):
    """Parse a Sogou WeChat search result page into a list of article dicts.

    Every ``<li>`` whose id starts with ``sogou_vr_11002601_box_`` is treated
    as one search result.

    Args:
        html_content: HTML text of the result page.

    Returns:
        A list of dicts with the string keys ``title``, ``link``,
        ``description``, ``author`` and ``pubDate``. A field that cannot be
        found (or, for the timestamp, cannot be decoded) is filled with a
        ``"No ... found"`` placeholder instead of raising.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Collect every <li> that represents a search result.
    items = soup.find_all('li', id=lambda x: x and x.startswith('sogou_vr_11002601_box_'))

    results = []
    for item in items:
        title, link = _extract_title_and_link(item)
        summary = _text_or_default(
            item.find('p', class_='txt-info'), "No summary found"
        )
        publisher = _text_or_default(
            item.find('span', class_='all-time-y2'), "No publisher found"
        )
        results.append({
            "title": title,
            "link": link,
            "description": summary,
            "author": publisher,
            "pubDate": _extract_pub_time(item),
        })

    return results
|
|
|
|
|
|
|
|
|
|
def remove_surrogates(text):
    """Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates).

    The text is encoded to UTF-8 with unencodable characters ignored and then
    decoded again, so every other character is returned unchanged.
    """
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')
|
|
|
|
|
|
|
|
|
|
def sougou_wx_main(keywords):
    """Search Sogou WeChat for each keyword and save all results as JSON.

    For every keyword the result page is fetched with ``fetch_html``, parsed
    with ``parse_html`` and cleaned with ``remove_surrogates``. The combined
    mapping ``{keyword: [article, ...]}`` is written to
    ``./JSON/sougou-wx.json`` (the directory is created when missing).
    Keywords whose page cannot be fetched are logged and left out.

    Args:
        keywords: Iterable of search terms.

    Returns:
        None. The output file is the only result.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    all_results = {}  # keyword -> list of cleaned article dicts

    for index, keyword in enumerate(keywords):
        if index:
            # Throttle between consecutive requests only: no wasted wait after
            # the last keyword, and a failed fetch is still followed by a pause.
            time.sleep(5)

        # Percent-encode the keyword so characters such as '&', '#', '+' or
        # spaces cannot corrupt the query string.
        query = quote(str(keyword), safe='')
        url = f"https://weixin.sogou.com/weixin?type=2&s_from=input&ie=utf8&query={query}"
        html_content = fetch_html(url)

        if html_content is None:
            logger.warning(f"无法获取微信公众号-Sogou搜索内容,跳过保存操作。关键词: {keyword}")
            continue

        results = parse_html(html_content)
        # Strip characters that cannot be encoded as UTF-8 before serialising.
        cleaned_results = [
            {k: remove_surrogates(v) for k, v in item.items()} for item in results
        ]
        logger.warning(f"关键词【{keyword}】的微信公众号-Sogou搜索内容保存成功。")
        all_results[keyword] = cleaned_results

    # Serialise everything at once; ensure_ascii=False keeps CJK text readable.
    json_results = json.dumps(all_results, ensure_ascii=False, indent=4)

    # Make sure the output directory exists before writing.
    os.makedirs(os.path.dirname('./JSON/sougou-wx.json'), exist_ok=True)

    with open('./JSON/sougou-wx.json', 'w', encoding='utf-8') as f:
        f.write(json_results)
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Sample search terms used when the module is run directly as a script.
    keywords = [
        "齐鲁银行",
        "APP逆向",
        "渗透测试",
    ]
    sougou_wx_main(keywords)
|