import feedparser
import json
from datetime import datetime, timezone, timedelta
from loguru import logger
import yaml

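# Both loops below expect every configured feed to be written as "Name|URL",
# which is why each entry is split on '|'. A minimal sketch of what
# ./config/uni_rss.yaml is assumed to look like (the names and URLs here are
# illustrative placeholders, not real entries):
#
#   link:
#     - "Example Blog|https://example.com/rss.xml"
#   wechat:
#     - "示例公众号|https://example.com/wechat/feed.xml"
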
def fetch_rss_data(feed_urls, wechat_urls):
    all_entries = []

    # Process the generic RSS links
    for rss_url in feed_urls:
        source, url = rss_url.split('|')
        feed = feedparser.parse(url)
        for entry in feed.entries:
            entry_data = {
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'author': entry.get('author', ''),
                'description': entry.get('summary', ''),
                'published': entry.get('published', ''),
                'source': source.strip()
            }

            # Normalise the published field
            if entry_data['published']:
                try:
                    # Parse the RFC 822 style date used by most feeds
                    dt = datetime.strptime(entry_data['published'], '%a, %d %b %Y %H:%M:%S %z')
                    # Convert to China Standard Time (UTC+8)
                    dt_china = dt.astimezone(timezone(timedelta(hours=8)))
                    entry_data['published'] = dt_china.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    # If parsing fails, keep the original published string
                    pass

            all_entries.append(entry_data)

    # Process the WeChat official account RSS links
    for wx_url in wechat_urls:
        author, url = wx_url.split('|')
        feed = feedparser.parse(url)
        for entry in feed.entries:
            entry_data = {
                'title': entry.get('title', ''),
                'link': entry.get('link', ''),
                'author': author.strip(),  # use the author name given in the YAML file
                'description': entry.get('summary', ''),
                'published': entry.get('published', ''),
                'source': '微信公众号'  # source tag for WeChat official accounts
            }

            # Normalise the published field
            if entry_data['published']:
                try:
                    dt = datetime.strptime(entry_data['published'], '%a, %d %b %Y %H:%M:%S %z')
                    dt_china = dt.astimezone(timezone(timedelta(hours=8)))
                    entry_data['published'] = dt_china.strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    # If parsing fails, keep the original published string
                    pass

            all_entries.append(entry_data)

    return all_entries

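# For reference, each item that fetch_rss_data collects (and that is written
# to the JSON file below) has this shape; the values are illustrative, not
# taken from a real feed:
#
#   {
#       "title": "Example post",
#       "link": "https://example.com/post",
#       "author": "Example Author",
#       "description": "Summary text from the feed",
#       "published": "2024-01-01 08:00:00",
#       "source": "Example Blog"
#   }
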
def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    logger.info(f"通用爬虫数据已保存到 {filename}")  # "generic spider data saved to <filename>"

def uni_spider():
    # Read the feed configuration from uni_rss.yaml
    with open('./config/uni_rss.yaml', 'r', encoding='utf-8') as f:
        links = yaml.safe_load(f)

    feed_urls = links.get('link', [])
    wechat_urls = links.get('wechat', [])

    # Fetch the RSS data
    rss_data = fetch_rss_data(feed_urls, wechat_urls)

    # Save it to a JSON file
    save_to_json(rss_data, './resources/JSON/uni_rss.json')

if __name__ == "__main__":
    uni_spider()
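# Run the module directly to refresh the snapshot; the config and output paths
# above are relative, so this assumes the working directory is the project
# root (the file name used here is illustrative):
#
#   python uni_spider.py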