# -*- coding: utf-8 -*-
"""Crawler that downloads the Xianzhi community Atom feed and stores it as JSON."""

import os
import requests
import xmltodict
import json
from typing import Optional

from loguru import logger

logger.add("./log/spider.log",
           format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}",
           rotation="10 MB",
           compression="zip",
           encoding="utf-8")
# Uncomment to also print log records to the terminal:
# logger.add(lambda msg: print(msg),
#            format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}")

# Request headers used by the crawler while testing.
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9"
}


def fetch_rss(url: str, headers: dict, timeout: float = 20) -> Optional[bytes]:
    """Download *url* and return the raw response body.

    Args:
        url: Address of the feed to fetch.
        headers: HTTP headers sent with the GET request.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        The response content as bytes, or ``None`` when the request timed
        out, failed, or returned an HTTP error status. Failures are logged
        rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.
        return response.content
    except requests.Timeout:
        logger.warning(f"请求 {url} 超时,跳过保存操作。")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
        return None  # None signals that the request failed.


def _as_list(value):
    """Return *value* as a list: ``None`` -> ``[]``, a single item -> ``[item]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _link_href(link) -> str:
    """Return the first usable URL from a parsed ``<link>`` value.

    Handles the shapes xmltodict can produce for the element: a dict with
    an ``@href`` attribute, a plain string, a list of either, or ``None``.
    Returns ``''`` when no URL is found.
    """
    for candidate in _as_list(link):
        if isinstance(candidate, dict):
            href = candidate.get('@href', '')
            if href:
                return href
        elif isinstance(candidate, str) and candidate:
            return candidate
    return ''


# Xianzhi community crawler
def xianzhi_main():
    """Fetch the Xianzhi community feed and save its entries to ./JSON/xianzhi.json.

    Each entry is reduced to its ``title``, ``link``, ``published``, ``id``
    and ``summary`` fields. Nothing is written when the feed cannot be
    downloaded; parsing and saving errors are logged, not raised.
    """
    url = "https://xz.aliyun.com/feed"
    rss_content = fetch_rss(url, headers)

    if rss_content is None:
        logger.warning("无法获取先知社区RSS内容,跳过保存操作。")
        return

    output_path = './JSON/xianzhi.json'

    try:
        # Convert the XML payload into nested Python dicts.
        data = xmltodict.parse(rss_content)

        # xmltodict returns a bare dict (not a list) when the feed holds
        # exactly one <entry>, so normalise before iterating.
        entries = _as_list(data['feed'].get('entry'))

        # Collect the JSON-ready representation of every entry.
        entries_json = []
        for entry in entries:
            if not isinstance(entry, dict):
                # An empty <entry/> parses to None; there is nothing to extract.
                continue
            entries_json.append({
                'title': entry.get('title', ''),
                'link': _link_href(entry.get('link')),
                'published': entry.get('published', ''),
                'id': entry.get('id', ''),
                'summary': entry.get('summary', ''),
            })

        # Serialise before opening the file so that a serialisation error
        # cannot leave a truncated output file behind.
        json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)

        # Make sure the output directory exists before writing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)
        logger.info("数据已保存到 ./JSON/xianzhi.json!")
    except Exception as e:
        # Top-level boundary of the crawler: log the failure and carry on.
        logger.error(f"解析或保存先知社区RSS内容时发生错误: {e}")


# Example invocation
if __name__ == "__main__":
    xianzhi_main()