PyBot/spider/freebuf.py
2026-05-24 19:54:12 +08:00

197 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# # -*- coding: utf-8 -*-
# import os
# import requests
# import xml.etree.ElementTree as ET
# import json
# from requests.exceptions import RequestException
# from loguru import logger
# # 测试用爬虫请求头
# headers = {
# "Content-Type": "application/json",
# "Cache-Control": "no-cache",
# "Upgrade-Insecure-Requests": "1",
# "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
# "Sec-Fetch-Site": "same-origin",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-User": "?1",
# "Sec-Fetch-Dest": "document",
# "Accept-Language": "zh-CN,zh;q=0.9"
# }
# def fetch_rss(url, headers, timeout=20):
# try:
# response = requests.get(url, headers=headers, timeout=timeout)
# response.raise_for_status() # 检查请求是否成功
# return response.content
# except requests.Timeout:
# logger.warning(f"请求 {url} 超时,跳过保存操作。")
# return None
# except RequestException as e:
# logger.warning(f"请求 {url} 时发生错误: {e}")
# return None # 返回None表示请求失败
# def parse_rss(rss_content):
# items = []
# root = ET.fromstring(rss_content)
# for item in root.findall('.//item'):
# item_dict = {}
# for child in item:
# tag = child.tag
# # 将一标签替换名称方便处理
# if tag.startswith('{http://purl.org/rss/1.0/modules/content/}'):
# tag = 'body'
# item_dict[tag] = child.text
# items.append(item_dict)
# return items
# def save_to_json(data, filename):
# with open(filename, 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
# def freebuf_main():
# url = "https://www.freebuf.com/feed"
# rss_content = fetch_rss(url, headers)
# if rss_content is None:
# logger.warning("无法获取Freebuf RSS内容跳过保存操作。")
# return
# try:
# items = parse_rss(rss_content)
# # 确保目录存在
# os.makedirs(os.path.dirname('./resources/JSON/freebuf.json'), exist_ok=True)
# # 将解析后的数据保存到 JSON 文件
# save_to_json(items, './resources/JSON/freebuf.json')
# logger.info("数据已保存到 ./resources/JSON/freebuf.json")
# except Exception as e:
# logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")
# if __name__ == '__main__':
# freebuf_main()
# -*- coding: utf-8 -*-
import os
import subprocess
import xml.etree.ElementTree as ET
import json
from loguru import logger
def fetch_rss_curl(url, timeout=30):
    """Fetch the raw RSS payload at *url* with a plain ``curl`` GET request.

    Args:
        url: Feed URL to request.
        timeout: Maximum number of seconds curl may spend on the transfer
            (passed as ``--max-time``). The subprocess itself is given ten
            extra seconds before it is killed.

    Returns:
        The response body as ``bytes``, or ``None`` when curl is not
        installed, exits with a non-zero status (including an HTTP status
        of 400 or above), times out, or returns fewer than 100 bytes.
        Every failure is logged; no exception escapes this function.
    """
    try:
        cmd = [
            'curl',
            '-s',      # silent: no progress meter
            '-S',      # ...but still print the error message, otherwise the
                       # stderr logged below is always empty under -s
            '-L',      # follow redirects
            '--fail',  # non-zero exit on HTTP >= 400 instead of returning
                       # the server's error page as if it were the feed
            '--max-time', str(timeout),
            url,
        ]
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # Safety net in case curl does not honour --max-time.
            timeout=timeout + 10,
        )
        if result.returncode != 0:
            logger.warning(f"curl 命令执行失败: {result.stderr.decode('utf-8', errors='ignore')}")
            return None
        content = result.stdout
        # Anything under 100 bytes cannot be a real feed (this also covers
        # an empty body).
        if len(content) < 100:
            logger.warning(f"curl 返回的内容过短或为空,长度: {len(content)}")
            return None
        logger.info(f"成功通过 curl 获取 Freebuf RSS内容长度: {len(content)}")
        return content
    except FileNotFoundError:
        # Raised by subprocess when the curl executable is not on PATH.
        logger.error("系统中未找到 curl 命令,请确保已安装 curl")
        return None
    except subprocess.TimeoutExpired:
        logger.warning(f"curl 命令执行超时({timeout}秒)")
        return None
    except Exception as e:
        # Best-effort fetch: any other failure is logged and reported as
        # "no content" rather than propagated to the caller.
        logger.warning(f"使用 curl 获取 RSS 时发生错误: {e}")
        return None
def parse_rss(rss_content):
    """Parse an RSS document into a list of per-item dictionaries.

    Args:
        rss_content: Feed payload. ``bytes``, ``bytearray`` and
            ``memoryview`` are decoded as UTF-8 (a leading BOM is removed
            and undecodable bytes are dropped); a ``str`` is used as-is.
            ``None`` is accepted.

    Returns:
        One dict per ``<item>`` element, mapping each direct child's tag to
        its text (``None`` for an empty element). Children in the RSS
        content-module namespace are stored under the key ``'body'``. If a
        tag repeats inside one item, the last occurrence wins. An empty
        list is returned for ``None`` input or when the document cannot be
        parsed; parse failures are logged, not raised.
    """
    items = []
    if rss_content is None:
        return items
    content_ns = '{http://purl.org/rss/1.0/modules/content/}'
    try:
        if isinstance(rss_content, (bytes, bytearray, memoryview)):
            # 'utf-8-sig' strips a leading UTF-8 BOM while decoding.
            rss_text = bytes(rss_content).decode('utf-8-sig', errors='ignore')
        else:
            rss_text = rss_content
        # Discard anything in front of the first '<' so stray leading
        # characters do not break the XML parser.
        start = rss_text.find('<')
        if start > 0:
            rss_text = rss_text[start:]
        # NOTE(review): ElementTree is not hardened against maliciously
        # constructed XML; this assumes the feed source is trusted.
        root = ET.fromstring(rss_text)
        for item in root.findall('.//item'):
            items.append({
                ('body' if child.tag.startswith(content_ns) else child.tag): child.text
                for child in item
            })
    except ET.ParseError as e:
        logger.warning(f"XML 解析错误: {e}")
    except Exception as e:
        logger.warning(f"解析 RSS 时发生错误: {e}")
    return items
def save_to_json(data, filename):
    """Write *data* to *filename* as pretty-printed UTF-8 JSON.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If *data* contains a value that is not JSON-serializable.
            This is raised before the file is opened, so an existing file
            is left untouched.
        ValueError: If *data* contains a circular reference; also raised
            before the file is opened.
        OSError: If the file cannot be opened or written.
    """
    # Serialize first: opening with 'w' truncates the file immediately, so
    # encoding up front means a serialization failure cannot wipe out a
    # previously saved file.
    payload = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)
def freebuf_main(url="https://www.freebuf.com/feed",
                 output_path='./resources/JSON/freebuf.json'):
    """Fetch the Freebuf RSS feed and store its items as a JSON file.

    Args:
        url: Feed URL to download. Defaults to the Freebuf feed.
        output_path: Destination JSON file. Its parent directory is
            created when missing.

    Returns:
        None. Nothing is written when the download fails or the feed
        yields no items; every failure is logged rather than raised.
    """
    logger.info("开始获取 Freebuf RSS 内容...")
    rss_content = fetch_rss_curl(url)
    if rss_content is None:
        logger.warning("无法获取Freebuf RSS内容跳过保存操作。")
        return
    try:
        items = parse_rss(rss_content)
        if not items:
            logger.warning("解析后的 Freebuf RSS 数据为空。")
            return
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        save_to_json(items, output_path)
        logger.info(f"数据已保存到 {output_path},共 {len(items)} 条记录。")
    except Exception as e:
        # Top-level boundary of the crawl: log and carry on instead of
        # crashing the caller.
        logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")
# Run the Freebuf crawl only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    freebuf_main()