# # -*- coding: utf-8 -*- # import os # import requests # import xml.etree.ElementTree as ET # import json # from requests.exceptions import RequestException # from loguru import logger # # 测试用爬虫请求头 # headers = { # "Content-Type": "application/json", # "Cache-Control": "no-cache", # "Upgrade-Insecure-Requests": "1", # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36", # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", # "Sec-Fetch-Site": "same-origin", # "Sec-Fetch-Mode": "navigate", # "Sec-Fetch-User": "?1", # "Sec-Fetch-Dest": "document", # "Accept-Language": "zh-CN,zh;q=0.9" # } # def fetch_rss(url, headers, timeout=20): # try: # response = requests.get(url, headers=headers, timeout=timeout) # response.raise_for_status() # 检查请求是否成功 # return response.content # except requests.Timeout: # logger.warning(f"请求 {url} 超时,跳过保存操作。") # return None # except RequestException as e: # logger.warning(f"请求 {url} 时发生错误: {e}") # return None # 返回None表示请求失败 # def parse_rss(rss_content): # items = [] # root = ET.fromstring(rss_content) # for item in root.findall('.//item'): # item_dict = {} # for child in item: # tag = child.tag # # 将一标签替换名称方便处理 # if tag.startswith('{http://purl.org/rss/1.0/modules/content/}'): # tag = 'body' # item_dict[tag] = child.text # items.append(item_dict) # return items # def save_to_json(data, filename): # with open(filename, 'w', encoding='utf-8') as f: # json.dump(data, f, ensure_ascii=False, indent=4) # def freebuf_main(): # url = "https://www.freebuf.com/feed" # rss_content = fetch_rss(url, headers) # if rss_content is None: # logger.warning("无法获取Freebuf RSS内容,跳过保存操作。") # return # try: # items = parse_rss(rss_content) # # 确保目录存在 # os.makedirs(os.path.dirname('./resources/JSON/freebuf.json'), exist_ok=True) # # 将解析后的数据保存到 JSON 文件 # save_to_json(items, 
#                          './resources/JSON/freebuf.json')
#         logger.info("数据已保存到 ./resources/JSON/freebuf.json!")
#     except Exception as e:
#         logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")
#
# if __name__ == '__main__':
#     freebuf_main()
# (end of the disabled requests-based implementation; the live code follows)

# -*- coding: utf-8 -*-
"""Fetch the FreeBuf RSS feed with curl and store its items as a JSON file."""
import json
import os
import subprocess
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Union

try:
    from loguru import logger
except ImportError:
    # Keep the module importable where loguru is not installed; the stdlib
    # logger exposes the same info/warning/error methods used below.
    import logging

    logger = logging.getLogger(__name__)

# Default feed location and output file used by freebuf_main().
FREEBUF_FEED_URL = "https://www.freebuf.com/feed"
FREEBUF_JSON_PATH = './resources/JSON/freebuf.json'

# Responses shorter than this many bytes are treated as a failed fetch.
MIN_RSS_LENGTH = 100

# Child tags in this namespace (e.g. <content:encoded>) are stored as "body".
CONTENT_NS_PREFIX = '{http://purl.org/rss/1.0/modules/content/}'

# UTF-8 byte-order mark that may precede the XML document.
UTF8_BOM = b'\xef\xbb\xbf'


def fetch_rss_curl(url: str, timeout: int = 30) -> Optional[bytes]:
    """Download *url* with a plain ``curl`` GET request.

    Args:
        url: Address of the RSS feed.
        timeout: Value passed to curl's ``--max-time``; the subprocess itself
            is allowed ``timeout + 10`` seconds before it is abandoned.

    Returns:
        The raw response body as bytes, or ``None`` when curl is missing,
        exits with a non-zero status, times out, or returns fewer than
        ``MIN_RSS_LENGTH`` bytes. Failures are logged, never raised.
    """
    try:
        cmd = [
            'curl',
            '-sS',      # silent, but still print the error text to stderr
            '-L',       # follow redirects
            '--fail',   # non-zero exit on HTTP >= 400 instead of an error page
            '--max-time', str(timeout),
            url,
        ]
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout + 10,
        )
    except FileNotFoundError:
        logger.error("系统中未找到 curl 命令,请确保已安装 curl")
        return None
    except subprocess.TimeoutExpired:
        logger.warning(f"curl 命令执行超时({timeout}秒)")
        return None
    except Exception as e:
        logger.warning(f"使用 curl 获取 RSS 时发生错误: {e}")
        return None

    if result.returncode != 0:
        logger.warning(f"curl 命令执行失败: {result.stderr.decode('utf-8', errors='ignore')}")
        return None

    content = result.stdout
    # An empty body has length 0, so this single check covers both cases.
    if len(content) < MIN_RSS_LENGTH:
        logger.warning(f"curl 返回的内容过短或为空,长度: {len(content)}")
        return None

    logger.info(f"成功通过 curl 获取 Freebuf RSS,内容长度: {len(content)}")
    return content


def parse_rss(rss_content: Union[bytes, str, None]) -> List[Dict[str, Optional[str]]]:
    """Parse RSS XML into a list of dictionaries, one per ``<item>``.

    Each dictionary maps a child element's tag to its text. Tags in the
    ``content`` module namespace are stored under the key ``"body"``. When a
    tag occurs more than once inside an item, the last occurrence wins.

    Args:
        rss_content: Feed document as bytes (decoded as UTF-8, undecodable
            bytes dropped) or as text. ``None`` yields an empty list.

    Returns:
        The parsed items. Parse errors are logged and whatever was collected
        up to that point (normally nothing) is returned.
    """
    items: List[Dict[str, Optional[str]]] = []
    if rss_content is None:
        return items

    try:
        if isinstance(rss_content, bytes):
            if rss_content.startswith(UTF8_BOM):
                rss_content = rss_content[len(UTF8_BOM):]
            rss_text = rss_content.decode('utf-8', errors='ignore')
        else:
            rss_text = rss_content

        # Drop anything in front of the first tag so the parser starts at
        # the XML declaration or root element.
        first_lt = rss_text.find('<')
        if first_lt != -1:
            rss_text = rss_text[first_lt:]

        root = ET.fromstring(rss_text)
        for item in root.findall('.//item'):
            item_dict = {}
            for child in item:
                tag = child.tag
                if tag.startswith(CONTENT_NS_PREFIX):
                    tag = 'body'
                item_dict[tag] = child.text
            items.append(item_dict)
    except ET.ParseError as e:
        logger.warning(f"XML 解析错误: {e}")
    except Exception as e:
        logger.warning(f"解析 RSS 时发生错误: {e}")

    return items


def save_to_json(data, filename):
    """Write *data* to *filename* as indented UTF-8 JSON without ASCII escaping.

    The parent directory must already exist; errors from ``open`` or
    ``json.dump`` propagate to the caller.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def freebuf_main(url: str = FREEBUF_FEED_URL, output_path: str = FREEBUF_JSON_PATH) -> None:
    """Fetch the feed at *url*, parse it and save the items to *output_path*.

    Nothing is written when the download fails or no items are parsed. Every
    failure is logged as a warning rather than raised.

    Args:
        url: Feed address; defaults to the FreeBuf feed.
        output_path: Destination JSON file; missing parent directories are
            created.
    """
    logger.info("开始获取 Freebuf RSS 内容...")

    rss_content = fetch_rss_curl(url)
    if rss_content is None:
        logger.warning("无法获取Freebuf RSS内容,跳过保存操作。")
        return

    try:
        items = parse_rss(rss_content)
        if not items:
            logger.warning("解析后的 Freebuf RSS 数据为空。")
            return

        # os.makedirs('') raises, so only create a parent when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        save_to_json(items, output_path)
        logger.info(f"数据已保存到 {output_path}!共 {len(items)} 条记录。")
    except Exception as e:
        logger.warning(f"解析或保存Freebuf RSS内容时发生错误: {e}")


if __name__ == '__main__':
    freebuf_main()