import feedparser
import json
from datetime import datetime, timezone, timedelta
from loguru import logger
import yaml


def _normalize_published(published):
    """Convert an RFC 822 timestamp into 'YYYY-MM-DD HH:MM:SS' in UTC+8.

    Args:
        published: The raw ``published`` string from a feed entry
            (expected format: ``'%a, %d %b %Y %H:%M:%S %z'``).

    Returns:
        The reformatted China-time string, or the original value
        unchanged when it is empty or cannot be parsed.
    """
    if not published:
        return published
    try:
        dt = datetime.strptime(published, '%a, %d %b %Y %H:%M:%S %z')
    except ValueError:
        # Unparseable timestamp: keep the raw feed value as-is.
        return published
    return dt.astimezone(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S')


def _entries_from_feed(url, source, author=None):
    """Fetch one feed and map its entries to plain dicts.

    Args:
        url: The feed URL to parse.
        source: Value stored in each entry's ``'source'`` field.
        author: When given, overrides the per-entry author (used for
            WeChat feeds whose author comes from the YAML config);
            otherwise the feed entry's own author is used.

    Returns:
        A list of entry dicts with normalized ``'published'`` values.
    """
    entries = []
    feed = feedparser.parse(url)
    for entry in feed.entries:
        entries.append({
            'title': entry.get('title', ''),
            'link': entry.get('link', ''),
            'author': author if author is not None else entry.get('author', ''),
            'description': entry.get('summary', ''),
            'published': _normalize_published(entry.get('published', '')),
            'source': source,
        })
    return entries


def fetch_rss_data(feed_urls, wechat_urls):
    """Collect entries from generic RSS feeds and WeChat feeds.

    Args:
        feed_urls: Iterable of ``'source|url'`` strings for generic feeds.
        wechat_urls: Iterable of ``'author|url'`` strings for WeChat feeds;
            these entries are tagged with source ``'微信公众号'`` and the
            configured author name.

    Returns:
        A flat list of entry dicts from all feeds.
    """
    all_entries = []

    # Generic RSS links: 'source|url'. maxsplit=1 tolerates '|' inside the URL.
    for rss_url in feed_urls:
        source, url = rss_url.split('|', 1)
        all_entries.extend(_entries_from_feed(url, source.strip()))

    # WeChat RSS links: 'author|url'; author comes from the YAML config,
    # not from the feed itself.
    for wx_url in wechat_urls:
        author, url = wx_url.split('|', 1)
        all_entries.extend(_entries_from_feed(url, '微信公众号', author=author.strip()))

    return all_entries


def save_to_json(data, filename):
    """Serialize *data* as pretty-printed UTF-8 JSON to *filename*."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    # Bug fix: the original logged the literal '(unknown)' instead of the path.
    logger.info(f"通用爬虫数据已保存到 {filename}")


def uni_spider():
    """Entry point: read feed config, fetch all entries, write them to JSON."""
    # Load the feed configuration.
    with open('./config/uni_rss.yaml', 'r', encoding='utf-8') as f:
        links = yaml.safe_load(f)
    feed_urls = links.get('link', [])
    wechat_urls = links.get('wechat', [])

    # Fetch the RSS data and persist it.
    rss_data = fetch_rss_data(feed_urls, wechat_urls)
    save_to_json(rss_data, './resources/JSON/uni_rss.json')


if __name__ == "__main__":
    uni_spider()