# Freebuf RSS fetcher (source listing metadata: 197 lines, 6.4 KiB, Python)
# Earlier requests-based version of this module, kept commented out.
# # -*- coding: utf-8 -*-
# import os
# import requests
# import xml.etree.ElementTree as ET
# import json
# from requests.exceptions import RequestException
# from loguru import logger

# # Request headers used when testing the crawler
# headers = {
#     "Content-Type": "application/json",
#     "Cache-Control": "no-cache",
#     "Upgrade-Insecure-Requests": "1",
#     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#     "Sec-Fetch-Site": "same-origin",
#     "Sec-Fetch-Mode": "navigate",
#     "Sec-Fetch-User": "?1",
#     "Sec-Fetch-Dest": "document",
#     "Accept-Language": "zh-CN,zh;q=0.9"
# }

# def fetch_rss(url, headers, timeout=20):
#     try:
#         response = requests.get(url, headers=headers, timeout=timeout)
#         response.raise_for_status()  # check whether the request succeeded
#         return response.content
#     except requests.Timeout:
#         logger.warning(f"请求 {url} 超时,跳过保存操作。")
#         return None
#     except RequestException as e:
#         logger.warning(f"请求 {url} 时发生错误: {e}")
#         return None  # None signals that the request failed

# def parse_rss(rss_content):
#     items = []
#     root = ET.fromstring(rss_content)
#     for item in root.findall('.//item'):
#         item_dict = {}
#         for child in item:
#             tag = child.tag
#             # Rename this namespaced tag so it is easier to handle
#             if tag.startswith('{http://purl.org/rss/1.0/modules/content/}'):
#                 tag = 'body'
#             item_dict[tag] = child.text
#         items.append(item_dict)
#     return items

# def save_to_json(data, filename):
#     with open(filename, 'w', encoding='utf-8') as f:
#         json.dump(data, f, ensure_ascii=False, indent=4)

# def freebuf_main():
#     url = "https://www.freebuf.com/feed"
#     rss_content = fetch_rss(url, headers)
#
#     if rss_content is None:
#         logger.warning("无法获取Freebuf RSS内容,跳过保存操作。")
#         return
#
#     try:
#         items = parse_rss(rss_content)
#
#         # Make sure the output directory exists
#         os.makedirs(os.path.dirname('./resources/JSON/freebuf.json'), exist_ok=True)
#
#         # Save the parsed data to a JSON file
#         save_to_json(items, './resources/JSON/freebuf.json')
#         logger.info("数据已保存到 ./resources/JSON/freebuf.json!")
#     except Exception as e:
#         logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")
#
# if __name__ == '__main__':
#     freebuf_main()


# -*- coding: utf-8 -*-
import json
import os
import subprocess
import xml.etree.ElementTree as ET

from loguru import logger


def fetch_rss_curl(url, timeout=30):
    """Fetch the raw RSS payload at ``url`` with a plain ``curl`` GET request.

    Args:
        url: Address of the feed to download.
        timeout: Limit in seconds passed to curl's ``--max-time``. The
            subprocess itself is allowed ``timeout + 10`` seconds, so curl
            normally reports its own timeout before Python kills it.

    Returns:
        The response body as ``bytes``, or ``None`` when curl is not
        installed, exits with a non-zero status (including HTTP errors),
        times out, or returns fewer than 100 bytes.
    """
    cmd = [
        'curl',
        '-s',  # silent mode: no progress meter
        '-S',  # still print error messages, so the stderr logged below is informative
        '-L',  # follow redirects
        '--fail',  # HTTP >= 400 becomes a non-zero exit instead of an error page body
        '--max-time', str(timeout),
        url,
    ]

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout + 10,
        )
    except FileNotFoundError:
        logger.error("系统中未找到 curl 命令,请确保已安装 curl")
        return None
    except subprocess.TimeoutExpired:
        logger.warning(f"curl 命令执行超时({timeout}秒)")
        return None
    except Exception as e:
        # Boundary handler: callers rely on None rather than an exception.
        logger.warning(f"使用 curl 获取 RSS 时发生错误: {e}")
        return None

    if result.returncode != 0:
        logger.warning(f"curl 命令执行失败: {result.stderr.decode('utf-8', errors='ignore')}")
        return None

    content = result.stdout

    # Anything shorter than 100 bytes (including an empty body) cannot be a
    # usable feed, so treat it as a failed fetch.
    if len(content) < 100:
        logger.warning(f"curl 返回的内容过短或为空,长度: {len(content)}")
        return None

    logger.info(f"成功通过 curl 获取 Freebuf RSS,内容长度: {len(content)}")
    return content


def parse_rss(rss_content):
    """Parse an RSS document into a list of per-item dictionaries.

    Each ``<item>`` element becomes one dict mapping the tag of every direct
    child to that child's text (``None`` for empty elements). Children in the
    ``http://purl.org/rss/1.0/modules/content/`` namespace are stored under
    the key ``'body'``; other tags keep the name ElementTree reports. When a
    tag occurs more than once in an item, the last occurrence wins.

    Args:
        rss_content: The feed as ``bytes`` or ``str``. ``None`` or an empty
            value yields an empty list.

    Returns:
        A list of dicts, one per ``<item>``. Parse failures are logged and
        whatever was collected before the failure is returned.
    """
    items = []

    if not rss_content:
        return items

    content_ns = '{http://purl.org/rss/1.0/modules/content/}'

    try:
        if isinstance(rss_content, bytes):
            # 'utf-8-sig' removes a leading UTF-8 BOM when one is present.
            rss_text = rss_content.decode('utf-8-sig', errors='ignore')
        else:
            rss_text = rss_content

        # Drop anything that precedes the first '<' (stray whitespace, BOM
        # characters, etc.) so the XML parser starts at the document itself.
        first_lt = rss_text.find('<')
        if first_lt != -1:
            rss_text = rss_text[first_lt:]

        root = ET.fromstring(rss_text)

        for item in root.findall('.//item'):
            items.append({
                ('body' if child.tag.startswith(content_ns) else child.tag): child.text
                for child in item
            })

    except ET.ParseError as e:
        logger.warning(f"XML 解析错误: {e}")
    except Exception as e:
        logger.warning(f"解析 RSS 时发生错误: {e}")

    return items


def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes. An existing file at ``filename`` is overwritten.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def freebuf_main():
    """Download the Freebuf RSS feed and store its items as a JSON file.

    Fetches ``https://www.freebuf.com/feed`` via ``fetch_rss_curl``, parses
    it with ``parse_rss`` and writes the result to
    ``./resources/JSON/freebuf.json``, creating the directory if needed.
    Nothing is written when the download fails or no items are parsed.
    Errors are logged, never raised.
    """
    url = "https://www.freebuf.com/feed"
    output_path = './resources/JSON/freebuf.json'

    logger.info("开始获取 Freebuf RSS 内容...")
    rss_content = fetch_rss_curl(url)

    if rss_content is None:
        logger.warning("无法获取Freebuf RSS内容,跳过保存操作。")
        return

    try:
        items = parse_rss(rss_content)

        if not items:
            logger.warning("解析后的 Freebuf RSS 数据为空。")
            return

        # Make sure the target directory exists before writing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        save_to_json(items, output_path)
        logger.info(f"数据已保存到 {output_path}!共 {len(items)} 条记录。")

    except Exception as e:
        logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")


# Run the fetch-and-save job only when this file is executed as a script.
if __name__ == '__main__':
    freebuf_main()