# -*- coding: utf-8 -*-
"""Download the FreeBuf RSS feed and store its items as a JSON file."""
import json
import os
import xml.etree.ElementTree as ET

import requests
from loguru import logger
from requests.exceptions import RequestException

# File sink for the spider log: rotated at 10 MB, rotated files are zipped.
logger.add(
    "./log/spider.log",
    format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}",
    rotation="10 MB",
    compression="zip",
    encoding="utf-8",
)
# Optional sink that echoes log records to the shell terminal (disabled).
# logger.add(lambda msg: print(msg),
#     format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}")

# Request headers used by the crawler for testing.
headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Accept-Language": "zh-CN,zh;q=0.9"
}


def fetch_rss(url, headers, timeout=20):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the RSS feed.
        headers: HTTP headers sent with the GET request.
        timeout: Timeout passed to ``requests.get`` (defaults to 20).

    Returns:
        The response body as bytes, or ``None`` when the request timed out
        or failed with any other ``RequestException`` (including the
        ``HTTPError`` raised by ``raise_for_status``).
    """
    try:
        reply = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP error statuses into an exception handled below.
        reply.raise_for_status()
        return reply.content
    except RequestException as err:
        # Timeouts are a RequestException subclass; they are only worth a
        # warning, every other request failure is logged as an error.
        if isinstance(err, requests.Timeout):
            logger.warning(f"请求 {url} 超时,跳过保存操作。")
        else:
            logger.error(f"请求 {url} 时发生错误: {err}")
        # None tells the caller that the request failed.
        return None


def _entry_to_dict(entry):
    """Map the direct children of one ``<item>`` element to ``{tag: text}``.

    Tags in the RSS "content" module namespace are renamed to ``'body'`` so
    they are easier to handle downstream. When a tag occurs more than once,
    the text of the last occurrence is kept.
    """
    content_ns = '{http://purl.org/rss/1.0/modules/content/}'
    return {
        ('body' if node.tag.startswith(content_ns) else node.tag): node.text
        for node in entry
    }


def parse_rss(rss_content):
    """Parse RSS XML and return one dict per ``<item>`` element.

    Args:
        rss_content: The XML document, as accepted by ``ET.fromstring``.

    Returns:
        A list of dicts mapping each item's child tag to that child's text
        (``None`` for children without text).

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not valid XML.
    """
    feed_root = ET.fromstring(rss_content)
    return [_entry_to_dict(entry) for entry in feed_root.findall('.//item')]


def save_to_json(data, filename):
    """Write *data* to *filename* as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than escaped. The parent
    directory of *filename* must already exist.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=4)


def freebuf_main():
    """Fetch the FreeBuf feed and save its items to ./JSON/freebuf.json.

    Nothing is written when the download fails. Any error raised while
    parsing or saving is logged and swallowed so the caller never sees it.
    """
    feed_url = "https://www.freebuf.com/feed"
    output_path = './JSON/freebuf.json'

    raw_feed = fetch_rss(feed_url, headers)
    if raw_feed is None:
        logger.warning("无法获取Freebuf RSS内容,跳过保存操作。")
        return

    try:
        entries = parse_rss(raw_feed)

        # Make sure the target directory exists before writing.
        target_dir = os.path.dirname(output_path)
        os.makedirs(target_dir, exist_ok=True)

        # Persist the parsed items as JSON.
        save_to_json(entries, output_path)
        logger.info("数据已保存到 ./JSON/freebuf.json!")
    except Exception as err:
        logger.error(f"解析或保存Freebuf RSS内容时发生错误: {err}")


if __name__ == '__main__':
    freebuf_main()