PyBot/media/xianzhi.py
2024-12-03 00:03:14 +08:00

55 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import requests
import xml.etree.ElementTree as ET
import xmltodict
import json
# Crawler request headers used for testing — mimic a desktop Chrome browser
# so the feed endpoint serves the normal response instead of blocking the bot.
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9"
}
def fetch_rss(url, headers, timeout=10):
    """Download an RSS/Atom feed and return the raw response body.

    Args:
        url: Feed URL to fetch.
        headers: HTTP request headers to send with the request.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The raw response body as ``bytes`` (XML, encoding untouched).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail fast on error statuses instead of handing an HTML error page
    # to the XML parser downstream.
    response.raise_for_status()
    return response.content
# 先知社区 爬虫
def xianzhi_main():
url = "https://xz.aliyun.com/feed"
rss_content = fetch_rss(url, headers)
# 将 XML 数据转换为 Python 字典
data = xmltodict.parse(rss_content)
# 提取所有的 <entry> 元素
entries = data['feed']['entry']
# 创建一个空列表来存储每个 <entry> 的 JSON 数据
entries_json = []
# 遍历每个 <entry> 元素,提取信息,并添加到列表中
for entry in entries:
entry_dict = {
'title': entry.get('title', ''),
'link': entry.get('link', {}).get('@href', ''), # 提取链接
'published': entry.get('published', ''),
'id': entry.get('id', ''),
'summary': entry.get('summary', '')
}
entries_json.append(entry_dict)
# 将 <entry> 元素的列表转换为 JSON 格式
json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)
# 保存 JSON 数据到文件
with open('./JSON/xianzhi.json', 'w', encoding='utf-8') as json_file:
json_file.write(json_data)
print("数据已保存到./JSON/xianzhi.json")