2024-12-06 16:32:34 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
|
2024-12-03 00:03:14 +08:00
|
|
|
|
import os
|
|
|
|
|
import requests
|
|
|
|
|
import xmltodict
|
|
|
|
|
import json
|
2024-12-09 23:52:49 +08:00
|
|
|
|
from loguru import logger
|
2024-12-06 16:53:58 +08:00
|
|
|
|
|
2024-12-09 23:52:49 +08:00
|
|
|
|
# Persist spider logs to a file sink in addition to loguru's default handler.
logger.add(
    "./log/spider.log",
    encoding="utf-8",
    # Rotation threshold and the archive format applied to rotated files.
    rotation="10 MB",
    compression="zip",
    # One line per record: timestamp - level - module:function:line - message.
    format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}",
)
|
|
|
|
|
# Print log records to the shell terminal
|
|
|
|
|
# logger.add(lambda msg: print(msg),
|
|
|
|
|
# format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {name}:{function}:{line} - {message}")
|
2024-12-03 00:03:14 +08:00
|
|
|
|
|
|
|
|
|
# Request headers used by the spider for testing
|
|
|
|
|
headers = {
    "Content-Type": "application/atom+xml; charset=utf-8",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    # Desktop Chrome on macOS user-agent string.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.55 Safari/537.36"
    ),
    # Browser-style Accept list with a */* fallback.
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9",
}
|
|
|
|
|
|
|
|
|
|
def fetch_rss(url, headers, timeout=10):
    """Download the raw body of an RSS/Atom feed.

    Args:
        url: Address of the feed to request.
        headers: HTTP request headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the spider.

    Returns:
        The response body as bytes, or ``None`` when the request failed
        (connection error, timeout, or a 4xx/5xx status). Failures are
        logged rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are logged below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"请求 {url} 时发生错误: {e}")
        return None
    return response.content
|
2024-12-03 00:03:14 +08:00
|
|
|
|
|
|
|
|
|
# Spider for the Xianzhi community (先知社区)
|
|
|
|
|
def _as_entry_list(entries):
    """Normalize the parsed ``entry`` value to a list of entry dicts."""
    # xmltodict yields a single dict (not a list) when the feed contains
    # exactly one <entry>; iterating that dict would walk its keys instead.
    if not isinstance(entries, list):
        entries = [entries]
    # Empty <entry/> elements parse to None; skip anything that is not a dict.
    return [entry for entry in entries if isinstance(entry, dict)]


def _entry_link(link):
    """Return the URL held by an entry's ``link`` value, or '' if absent."""
    # Several <link> elements are parsed into a list; take the first usable one.
    candidates = link if isinstance(link, list) else [link]
    for candidate in candidates:
        if isinstance(candidate, dict):
            href = candidate.get('@href', '')
        else:
            # A link without attributes parses to its plain text (or None).
            href = candidate or ''
        if href:
            return href
    return ''


def xianzhi_main():
    """Fetch the Xianzhi community feed and save its entries as JSON.

    Downloads ``https://xz.aliyun.com/feed`` via ``fetch_rss``, extracts the
    title, link, published date, id and summary of every ``<entry>``, and
    writes the resulting list to ``./JSON/xianzhi.json``.

    Returns:
        None. Download, parsing and file errors are logged, not raised.
    """
    url = "https://xz.aliyun.com/feed"
    rss_content = fetch_rss(url, headers)

    if rss_content is None:
        logger.warning("无法获取先知社区RSS内容,跳过保存操作。")
        return

    try:
        # Convert the XML document into nested Python dicts.
        data = xmltodict.parse(rss_content)

        # Collect every <entry> element, whether the feed has zero, one or many.
        entries = _as_entry_list(data['feed'].get('entry', []))

        entries_json = [
            {
                'title': entry.get('title', ''),
                'link': _entry_link(entry.get('link')),
                'published': entry.get('published', ''),
                'id': entry.get('id', ''),
                'summary': entry.get('summary', ''),
            }
            for entry in entries
        ]

        # Serialize before opening the file so a serialization error cannot
        # truncate an existing output file.
        json_data = json.dumps(entries_json, ensure_ascii=False, indent=4)

        output_path = './JSON/xianzhi.json'
        # The output directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        logger.info("数据已保存到 ./JSON/xianzhi.json!")
    except Exception as e:
        # Top-level boundary of the spider: report the failure and carry on.
        logger.error(f"解析或保存先知社区RSS内容时发生错误: {e}")
|
2024-12-06 16:53:58 +08:00
|
|
|
|
|
|
|
|
|
# Example invocation: run the Xianzhi spider when executed as a script.
if __name__ == "__main__":
    xianzhi_main()
|