# PyBot/spider/github.py

# -*- coding: utf-8 -*-
import time
import yaml
import requests
import json
from requests.exceptions import RequestException
from loguru import logger

MAX_DESCRIPTION_LENGTH = 300
with open('./config/github_config.yaml', 'r', encoding="utf-8") as file:
    config = yaml.safe_load(file)

token = config['github_token']
tool_list, keyword_list, user_list, black_words = config['tool_list'], config['keyword_list'], config['user_list'], config['black_words']
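
# A hypothetical github_config.yaml matching the keys read above
# (placeholder values for illustration, not the real config):
#
#   github_token: "ghp_xxxxxxxxxxxx"   # or null to send unauthenticated requests
#   keyword_list: ["cve", "exploit"]   # repository search keywords
#   tool_list: ["owner/repo"]          # repos watched for commits and releases
#   user_list: ["some-user"]           # users watched for newly created repos
#   black_words: ["deprecated"]        # descriptions containing these are skipped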
def fetch_rss(url, timeout=10):
    """Fetch a GitHub API endpoint and return the decoded JSON, or None on failure."""
    # A token is optional, but GitHub applies much stricter rate limits
    # to unauthenticated requests.
    if token is None:
        headers = {
            "Content-Type": "application/json"
        }
    else:
        headers = {
            'Authorization': f"token {token}",
            "Content-Type": "application/json"
        }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.Timeout:
        logger.warning(f"Request to {url} timed out; skipping save.")
        return None
    except RequestException as e:
        logger.error(f"Error while requesting {url}: {e}")
        return None
def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
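
# Three of the crawlers below inline the same byte-budget truncation loop.
# As a minimal sketch (not wired in, to keep the original flow intact), that
# logic is equivalent to a helper like this:
def truncate_utf8(text, max_bytes=MAX_DESCRIPTION_LENGTH):
    """Truncate text to at most max_bytes of UTF-8 without splitting a character."""
    if len(text.encode('utf-8')) <= max_bytes:
        return text
    byte_index = 0
    char_index = 0
    # walk characters until the byte budget is reached
    while byte_index < max_bytes and char_index < len(text):
        byte_index += len(text[char_index].encode('utf-8'))
        char_index += 1
    return text[:char_index - 1] + '...'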
def github_main_keyword(key):
    all_results = []  # accumulate results across all keywords
    for keyword in key:
        logger.info(f"github_keyword: starting fetch for keyword [{keyword}].")
        api_node = "https://api.github.com/search/repositories?q={}&sort=updated&per_page=20".format(keyword)
        result = fetch_rss(api_node)
        if result is None:
            time.sleep(5)
            continue
        # iterate over whatever was returned (the search may yield fewer than 20 items)
        for item in result['items']:
            description = item['description']
            if description is None:
                pass
            # skip repos whose description contains a blacklisted word
            elif any(word in description for word in black_words):
                continue
            # truncate the description at 300 bytes and append '...'
            elif len(description.encode('utf-8')) > MAX_DESCRIPTION_LENGTH:
                # find the character index at the 300-byte boundary
                byte_index = 0
                char_index = 0
                while byte_index < MAX_DESCRIPTION_LENGTH and char_index < len(description):
                    byte_index += len(description[char_index].encode('utf-8'))
                    char_index += 1
                description = description[:char_index - 1] + '...'
            link = item['html_url']
            name = item['name']
            created_at = item['created_at']
            author = item['owner']['login']
            language = item['language']
            # store this project's details in a dict
            project_info = {
                'link': link,
                'name': name,
                'created_at': created_at,
                'description': description,
                'author': author,
                'language': language,
                'keyword': keyword
            }
            all_results.append(project_info)
        time.sleep(5)
    # write all results to a JSON file
    save_to_json(all_results, './resources/JSON/github_keyword.json')
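
# Shape of one github_keyword.json record (values are illustrative only):
#
#   {
#       "link": "https://github.com/octocat/Hello-World",
#       "name": "Hello-World",
#       "created_at": "2025-01-01T00:00:00Z",
#       "description": "...",
#       "author": "octocat",
#       "language": "Python",
#       "keyword": "cve"
#   }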
def github_main_repo(key):
    all_results = []  # accumulate results across all repos
    for keyword in key:
        logger.info(f"github_repo: starting fetch of update status for repo [{keyword}].")
        api_node = "https://api.github.com/repos/{}/commits?per_page=1".format(keyword)
        result = fetch_rss(api_node)
        if result is None:
            time.sleep(5)
            continue
        # skip repos with no commits, mirroring the release check further down
        if not result:
            time.sleep(5)
            continue
        commit = result[0]  # latest commit record
        description = commit['commit']['message']
        if description is None:
            pass
        # truncate the message at 300 bytes and append '...'
        elif len(description.encode('utf-8')) > MAX_DESCRIPTION_LENGTH:
            byte_index = 0
            char_index = 0
            while byte_index < MAX_DESCRIPTION_LENGTH and char_index < len(description):
                byte_index += len(description[char_index].encode('utf-8'))
                char_index += 1
            description = description[:char_index - 1] + '...'
        author = commit['commit']['author']['name']
        updated_at = commit['commit']['author']['date']
        link_2 = commit['html_url']
        # store this repo's details in a dict
        project_info = {
            'link': f"https://github.com/{keyword}",
            'name': keyword,
            'updated_at': updated_at,
            'description': description,
            'author': author,
            'link_2': link_2,
            'keyword': keyword
        }
        all_results.append(project_info)
        time.sleep(5)
    # write all results to a JSON file
    save_to_json(all_results, './resources/JSON/github_repo.json')
def github_main_release(key):
    all_results = []  # accumulate results across all repos
    for keyword in key:
        logger.info(f"github_release: starting fetch of release status for repo [{keyword}].")
        api_node = "https://api.github.com/repos/{}/releases?per_page=1".format(keyword)
        result = fetch_rss(api_node)
        if result is None:
            time.sleep(5)
            continue
        if not result:
            logger.warning(f"github_release: repo [{keyword}] has no releases.")
            time.sleep(5)
            continue
        release = result[0]  # latest release record
        author = release['author']['login']
        published_at = release['published_at']
        link = release['html_url']
        # store this release's details in a dict
        project_info = {
            'link': link,
            'published_at': published_at,
            'author': author,
            'keyword': keyword
        }
        all_results.append(project_info)
        time.sleep(5)
    # write all results to a JSON file
    save_to_json(all_results, './resources/JSON/github_release.json')
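
# Shape of one github_release.json record (values are illustrative only):
#
#   {
#       "link": "https://github.com/octocat/Hello-World/releases/tag/v1.0.0",
#       "published_at": "2025-01-01T00:00:00Z",
#       "author": "octocat",
#       "keyword": "octocat/Hello-World"
#   }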
def github_main_user(key):
    all_results = []  # accumulate results across all users
    for keyword in key:
        logger.info(f"github_user: starting fetch of recent repos for author [{keyword}].")
        api_node = "https://api.github.com/users/{}/repos?sort=created&per_page=10".format(keyword)
        result = fetch_rss(api_node)
        if result is None:
            time.sleep(5)
            continue
        for repo in result:
            description = repo['description']
            if description is None:
                pass
            # skip repos whose description contains a blacklisted word
            elif any(word in description for word in black_words):
                continue
            # truncate the description at 300 bytes and append '...'
            elif len(description.encode('utf-8')) > MAX_DESCRIPTION_LENGTH:
                # find the character index at the 300-byte boundary
                byte_index = 0
                char_index = 0
                while byte_index < MAX_DESCRIPTION_LENGTH and char_index < len(description):
                    byte_index += len(description[char_index].encode('utf-8'))
                    char_index += 1
                description = description[:char_index - 1] + '...'
            link = repo['html_url']
            name = repo['name']
            created_at = repo['created_at']
            author = repo['owner']['login']
            language = repo['language']
            # store this repo's details in a dict
            project_info = {
                'link': link,
                'name': name,
                'created_at': created_at,
                'description': description,
                'author': author,
                'language': language,
                'keyword': keyword
            }
            all_results.append(project_info)
        time.sleep(5)
    # write all results to a JSON file
    save_to_json(all_results, './resources/JSON/github_user.json')
def github_main():
    if keyword_list:
        github_main_keyword(keyword_list)
    if tool_list:
        github_main_repo(tool_list)
        github_main_release(tool_list)
    if user_list:
        github_main_user(user_list)

if __name__ == "__main__":
    github_main()
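
# Assumed usage: run from the PyBot project root so the relative ./config and
# ./resources paths above resolve, e.g.:
#   python spider/github.py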