56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
|
import os
|
|||
|
import requests
|
|||
|
import xml.etree.ElementTree as ET
|
|||
|
import json
|
|||
|
|
|||
|
# Request headers the crawler sends while testing; they imitate a desktop
# Chrome browser performing a same-origin page navigation.
headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.55 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
|
|||
|
|
|||
|
def fetch_rss(url, headers, timeout=10):
    """Download an RSS feed and return its raw body.

    Args:
        url: Address of the feed to request.
        headers: Mapping of HTTP headers sent with the GET request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            server would block the caller indefinitely.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.Timeout: The server did not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions.
    return response.content
|
|||
|
|
|||
|
def parse_rss(rss_content):
    """Convert an RSS document into a list of per-item dictionaries.

    Every ``<item>`` element found anywhere in the document becomes one dict
    that maps each direct child's tag to that child's text (``None`` when the
    child has no text). Children in the RSS "content" module namespace are
    stored under the key ``'body'`` instead of their namespaced tag. When an
    item repeats a tag, the last occurrence wins.

    Args:
        rss_content: The XML document as ``str`` or ``bytes``.

    Returns:
        A list of dicts, one per ``<item>``, in document order.
    """
    content_ns = '{http://purl.org/rss/1.0/modules/content/}'
    tree = ET.fromstring(rss_content)
    return [
        {
            # Rename content-module tags to a short key that is easier to use.
            ('body' if node.tag.startswith(content_ns) else node.tag): node.text
            for node in entry
        }
        for entry in tree.findall('.//item')
    ]
|
|||
|
|
|||
|
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: ``data`` contains an object that cannot be serialized. In
            that case the target file is not touched.
        OSError: The file cannot be opened or written.
    """
    # Serialize before opening: open(..., 'w') truncates the file right away,
    # so a serialization failure must happen before any existing content is
    # destroyed or a partial document is written.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialized)
|
|||
|
|
|||
|
def freebuf_main(url="https://www.freebuf.com/feed", output_path='./JSON/freebuf.json'):
    """Fetch an RSS feed, parse its items, and save them to a JSON file.

    Args:
        url: Address of the RSS feed to download. Defaults to the FreeBuf
            feed.
        output_path: Destination JSON file. Its parent directory is created
            when missing.

    Raises:
        Whatever ``fetch_rss``, ``parse_rss`` or ``save_to_json`` raise; no
        exception is handled here.
    """
    rss_content = fetch_rss(url, headers)
    items = parse_rss(rss_content)

    # Make sure the target directory exists. os.path.dirname() returns '' for
    # a bare file name and os.makedirs('') raises, so skip creation then.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Persist the parsed items as JSON.
    save_to_json(items, output_path)
    print(f"数据已保存到 {output_path}!")
|
|||
|
|
|||
|
# Run the FreeBuf export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    freebuf_main()
|