PyBot/media/xianzhi.py
2024-12-08 00:18:31 +08:00

86 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import os
import requests
import xmltodict
import json
import logging
# Configure logging: the root logger writes every record at INFO and above
# to ./log/spider.log (append mode, UTF-8) and to the console.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers.clear()  # remove any handlers already attached

# FileHandler opens its file immediately, so the directory has to exist
# before the handler is constructed; otherwise importing this module fails.
os.makedirs('./log', exist_ok=True)

file_handler = logging.FileHandler('./log/spider.log', mode='a', encoding='utf-8')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.addHandler(console_handler)
logger.propagate = False  # disable log propagation
# Request headers used by the crawler (intended for testing).
# The long values are assembled from smaller pieces for readability; the
# resulting strings are identical to single-line literals.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/96.0.4664.55 Safari/537.36"
)
_ACCEPTED_MEDIA_TYPES = ",".join(
    (
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/avif",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
        "application/signed-exchange;v=b3;q=0.7",
    )
)
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": _ACCEPTED_MEDIA_TYPES,
    "Accept-Language": "zh-CN,zh;q=0.9",
}
def fetch_rss(url, headers, timeout=10):
    """Download the raw body of an RSS/Atom feed.

    Args:
        url: Address of the feed to request.
        headers: HTTP request headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait on a stalled server indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). Failures are
        logged rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx statuses into exceptions
    except requests.exceptions.RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
        return None
    return response.content
# Xianzhi community (xz.aliyun.com) crawler
def _as_list(value):
    """Return *value* as a list: ``None`` -> ``[]``, a lone item -> ``[item]``."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _extract_href(link):
    """Return the ``@href`` of a parsed ``<link>``; use the first one if repeated."""
    for candidate in _as_list(link):
        if isinstance(candidate, dict):
            return candidate.get('@href', '')
    return ''


def xianzhi_main():
    """Fetch the Xianzhi feed and save its entries to ./JSON/xianzhi.json.

    Each ``<entry>`` of the feed is reduced to its ``title``, ``link``
    (the ``href`` attribute), ``published``, ``id`` and ``summary`` fields;
    missing fields become empty strings. The result is written as a JSON
    array, UTF-8 encoded and indented by four spaces.

    Returns:
        None. If the feed cannot be fetched a warning is logged and nothing
        is written; parse or save errors are logged, not raised.
    """
    url = "https://xz.aliyun.com/feed"
    rss_content = fetch_rss(url, headers)
    if rss_content is None:
        logger.warning("无法获取先知社区RSS内容跳过保存操作。")
        return

    try:
        # Convert the XML payload into nested Python dictionaries.
        data = xmltodict.parse(rss_content)

        # xmltodict only produces a list for repeated elements: a feed with a
        # single <entry> yields one dict, so normalise before iterating.
        entries = _as_list(data['feed'].get('entry', []))

        # Keep only the fields of interest from every well-formed entry.
        entries_json = [
            {
                'title': entry.get('title', ''),
                # <link> may be one dict or a list of them.
                'link': _extract_href(entry.get('link')),
                'published': entry.get('published', ''),
                'id': entry.get('id', ''),
                'summary': entry.get('summary', ''),
            }
            for entry in entries
            if isinstance(entry, dict)
        ]

        json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)

        # Make sure the output directory exists before opening the file.
        os.makedirs('./JSON', exist_ok=True)
        with open('./JSON/xianzhi.json', 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)
        logger.info("数据已保存到 ./JSON/xianzhi.json")
    except Exception as e:
        # Top-level boundary of the crawler: report the failure, do not crash.
        logger.error(f"解析或保存先知社区RSS内容时发生错误: {e}")
# Example invocation: run the crawler once when executed as a script.
if __name__ == "__main__":
    xianzhi_main()