# PyBot/spider/uni.py

import feedparser
import json
from datetime import datetime, timezone, timedelta
from loguru import logger
import yaml


def fetch_rss_data(feed_urls, wechat_urls):
    all_entries = []
    # Handle the generic RSS links
    for rss_url in feed_urls:
        source, url = rss_url.split('|')
        feed = feedparser.parse(url)
        for entry in feed.entries:
            entry_data = {
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'author': entry.get('author', ''),
                'description': entry.get('summary', ''),
                'published': entry.get('published', ''),
                'source': source.strip()
            }
            # Normalize the published field
            if entry_data['published']:
                try:
                    # Parse the RFC 822 timestamp, e.g. 'Fri, 14 Mar 2025 03:50:28 +0000',
                    # and convert it to China Standard Time (UTC+8)
                    dt = datetime.strptime(entry_data['published'], '%a, %d %b %Y %H:%M:%S %z')
                    dt_china = dt.astimezone(timezone(timedelta(hours=8)))
                    entry_data['published'] = dt_china.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    # If parsing fails, keep the original published string
                    pass
            all_entries.append(entry_data)
    # Handle the WeChat Official Account RSS links
    for wx_url in wechat_urls:
        author, url = wx_url.split('|')
        feed = feedparser.parse(url)
        for entry in feed.entries:
            entry_data = {
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'author': author.strip(),  # Use the author name specified in the YAML file
                'description': entry.get('summary', ''),
                'published': entry.get('published', ''),
                'source': '微信公众号'
            }
            # Normalize the published field
            if entry_data['published']:
                try:
                    # Parse the RFC 822 timestamp and convert to China Standard Time (UTC+8)
                    dt = datetime.strptime(entry_data['published'], '%a, %d %b %Y %H:%M:%S %z')
                    dt_china = dt.astimezone(timezone(timedelta(hours=8)))
                    entry_data['published'] = dt_china.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    # If parsing fails, keep the original published string
                    pass
            all_entries.append(entry_data)
    return all_entries
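
# Sketch of a single entry produced by fetch_rss_data(); the keys match
# entry_data above, while the values are purely illustrative placeholders:
#   {
#       "title": "Example post title",
#       "link": "https://example.com/post",
#       "author": "Example Author",
#       "description": "Summary text from the feed",
#       "published": "2025-03-14 11:50:28",
#       "source": "ExampleSource"
#   }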


def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    logger.info(f"通用爬虫数据已保存到 {filename}")


def uni_spider():
    # Read the uni_rss.yaml configuration file
    with open('./config/uni_rss.yaml', 'r', encoding='utf-8') as f:
        links = yaml.safe_load(f)
    feed_urls = links.get('link', [])
    wechat_urls = links.get('wechat', [])
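
    # Assumed layout of ./config/uni_rss.yaml, inferred from the 'Name|URL'
    # split performed in fetch_rss_data (entries shown are placeholders):
    #   link:
    #     - "ExampleSource|https://example.com/rss.xml"
    #   wechat:
    #     - "ExampleAuthor|https://example.com/wechat-feed.xml"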
    # Fetch the RSS data
    rss_data = fetch_rss_data(feed_urls, wechat_urls)
    # Save to a JSON file
    save_to_json(rss_data, './resources/JSON/uni_rss.json')


if __name__ == "__main__":
    uni_spider()
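
# When run directly from the directory that contains ./config and ./resources,
# this script fetches all configured feeds and writes ./resources/JSON/uni_rss.json.
# An assumed quick way to inspect the result afterwards:
#   import json
#   with open('./resources/JSON/uni_rss.json', encoding='utf-8') as f:
#       print(len(json.load(f)), 'entries written')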