# PyBot/media/xianzhi.py

# -*- coding: utf-8 -*-
import os
import requests
import xmltodict
import json
import logging

# Configure root logging: INFO level, "timestamp - level - message" layout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Browser-like request headers used by the crawler for testing.
# Long values are split with implicit string concatenation for readability;
# the resulting header values are unchanged.
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.55 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9",
}

def fetch_rss(url, headers, timeout=10):
    """Download the raw body of a feed URL.

    Args:
        url: Address of the feed to request.
        headers: Mapping of HTTP headers sent with the GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole crawler.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status). Failures are
        logged rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        return response.content
    except requests.exceptions.RequestException as e:
        # Timeout is a RequestException subclass, so it is handled here too.
        logging.error(f"请求 {url} 时发生错误: {e}")
        return None

# Xianzhi community (xz.aliyun.com) crawler
def _as_list(value):
    """Normalize a parsed XML child value to a list.

    xmltodict yields a list only for repeated elements; an element that
    occurs once comes back as a single value, and a missing one as None.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _extract_link_href(link):
    """Return the first non-empty ``@href`` among an entry's link value(s), or ''."""
    for candidate in _as_list(link):
        if isinstance(candidate, dict):
            href = candidate.get('@href', '')
            if href:
                return href
    return ''


def xianzhi_main():
    """Fetch the Xianzhi feed and save its entries to ./JSON/xianzhi.json.

    Each ``<entry>`` is reduced to its ``title``, ``link`` (the ``href``
    attribute), ``published``, ``id`` and ``summary`` values, and the
    resulting list is written as UTF-8 JSON.

    Returns:
        None. Download failures skip the save with a warning; parsing or
        writing failures are logged as errors instead of being raised.
    """
    url = "https://xz.aliyun.com/feed"
    output_path = './JSON/xianzhi.json'
    rss_content = fetch_rss(url, headers)

    if rss_content is None:
        logging.warning("无法获取先知社区RSS内容，跳过保存操作。")
        return

    try:
        # Convert the XML payload into nested Python dicts/lists
        data = xmltodict.parse(rss_content)

        # A feed with exactly one <entry> parses to a single mapping rather
        # than a list; normalize so the loop below always sees entries.
        feed = data['feed'] or {}
        entries = _as_list(feed.get('entry'))

        # Keep only the fields of interest from each well-formed entry
        entries_json = [
            {
                'title': entry.get('title', ''),
                'link': _extract_link_href(entry.get('link')),
                'published': entry.get('published', ''),
                'id': entry.get('id', ''),
                'summary': entry.get('summary', ''),
            }
            for entry in entries
            if isinstance(entry, dict)  # skip empty <entry/> elements
        ]

        json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)

        # Make sure the target directory exists before writing
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        logging.info("数据已保存到 ./JSON/xianzhi.json！")
    except Exception as e:
        # Top-level boundary of the crawler: report and carry on
        logging.error(f"解析或保存先知社区RSS内容时发生错误: {e}")

# Example invocation: run the crawler when this file is executed as a script
if __name__ == "__main__":
    xianzhi_main()