PyBot/media/xianzhi.py

# -*- coding: utf-8 -*-
import os
import json

import requests
import xmltodict
# Browser-like request headers used by the crawler (for testing)
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9"
}


def fetch_rss(url, headers):
    response = requests.get(url, headers=headers, timeout=30)  # timeout added so a hung request cannot block forever
    response.raise_for_status()  # Raise an exception if the request failed
    # print(response.content.decode('utf-8'))  # Debug: inspect the raw XML response
    return response.content
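

# A minimal hardening sketch (an assumption, not part of the original scraper):
# retry transient network failures before giving up. fetch_rss_with_retries is
# a hypothetical helper name.
def fetch_rss_with_retries(url, headers, attempts=3):
    for attempt in range(attempts):
        try:
            return fetch_rss(url, headers)
        except requests.RequestException:
            if attempt == attempts - 1:  # Re-raise on the last failed attempt
                raise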


# Scraper for the XianZhi community (xz.aliyun.com)
def xianzhi_main():
    url = "https://xz.aliyun.com/feed"
    rss_content = fetch_rss(url, headers)
    # Parse the Atom XML into a Python dict
    data = xmltodict.parse(rss_content)
    # Extract all <entry> elements; xmltodict yields a single dict (not a
    # list) when the feed contains exactly one entry, so normalize that case
    entries = data['feed']['entry']
    if isinstance(entries, dict):
        entries = [entries]
    # Collect the JSON-ready data for each <entry>
    entries_json = []
    for entry in entries:
        # An Atom entry may carry several <link> elements; keep the first
        link = entry.get('link', {})
        if isinstance(link, list):
            link = link[0]
        entry_dict = {
            'title': entry.get('title', ''),
            'link': link.get('@href', ''),  # Extract the article URL
            'published': entry.get('published', ''),
            'id': entry.get('id', ''),
            'summary': entry.get('summary', '')
        }
        entries_json.append(entry_dict)
    # Serialize the list of entries to JSON
    json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)
    # Save the JSON data to a file, creating the output directory if needed
    os.makedirs('./JSON', exist_ok=True)
    with open('./JSON/xianzhi.json', 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)
    print("Data saved to ./JSON/xianzhi.json")