# -*- coding: utf-8 -*-
"""Persist crawled GitHub data in SQLite and render notification text.

JSON files under ``./JSON`` are loaded into ``./db/github.db``, recent rows
are selected, and each selected batch is rendered as message text.  Every
rendered batch is also prepended to ``./history/github.md``.
"""
import json
import os
import sqlite3
from contextlib import closing
from datetime import datetime, timedelta

_DB_PATH = './db/github.db'
_SQL_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
_API_TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'


def create_database():
    """Create the database file and its four tables if they do not exist.

    The statement uses ``CREATE TABLE IF NOT EXISTS`` so calling this
    repeatedly is harmless.  The parent directory is created on demand
    because ``sqlite3.connect`` cannot create missing directories.
    """
    os.makedirs(os.path.dirname(_DB_PATH), exist_ok=True)
    with closing(sqlite3.connect(_DB_PATH)) as conn:
        conn.executescript('''
        CREATE TABLE IF NOT EXISTS keywords (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            link TEXT,
            description TEXT,
            pubDate DATETIME,
            author TEXT,
            keyword TEXT,
            language TEXT,
            is_sended BOOLEAN
        );
        CREATE TABLE IF NOT EXISTS repos (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            link TEXT,
            description TEXT,
            pubDate DATETIME,
            author TEXT,
            keyword TEXT,
            link2 TEXT
        );
        CREATE TABLE IF NOT EXISTS releases (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            link TEXT,
            pubDate DATETIME,
            author TEXT,
            keyword TEXT
        );
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            link TEXT,
            description TEXT,
            pubDate DATETIME,
            author TEXT,
            keyword TEXT,
            language TEXT,
            is_sended BOOLEAN
        );
        ''')
        conn.commit()


def _load_json_list(path, label):
    """Load *path* and return its content, which must be a JSON list.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: if the decoded document is not a list.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{label}文件不存在,请检查程序是否运行正常!")
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    if not isinstance(data, list):
        raise ValueError("JSON文件格式错误,请检查爬取程序是否异常!")
    return data


def _format_pub_date(raw):
    """Convert an ISO ``...T...Z`` timestamp to ``YYYY-MM-DD HH:MM:SS``.

    Values that cannot be parsed (including non-strings such as a JSON
    ``null``) are returned unchanged so one bad timestamp does not abort
    the whole import.
    """
    try:
        parsed = datetime.strptime(raw, _API_TIME_FORMAT)
    except (ValueError, TypeError):
        return raw
    return parsed.strftime(_SQL_TIME_FORMAT)


def insert_data():
    """Load the four ``./JSON/github_*.json`` files into the database.

    ``keywords`` and ``users`` rows are inserted only when no row with the
    same title and author exists yet; ``repos`` and ``releases`` rows are
    always inserted.

    Raises:
        FileNotFoundError: if one of the JSON files is missing.
        ValueError: if a JSON file does not contain a list.
    """
    # Load and validate every file before touching the database.
    data_keyword = _load_json_list('./JSON/github_keyword.json', 'github_keyword')
    data_repo = _load_json_list('./JSON/github_repo.json', 'github_repo')
    data_release = _load_json_list('./JSON/github_release.json', 'github_release')
    data_user = _load_json_list('./JSON/github_user.json', 'github_user')

    with closing(sqlite3.connect(_DB_PATH)) as conn:
        cursor = conn.cursor()

        for item in data_keyword:
            title = item.get("name", "")
            author = item.get("author", "")
            # Skip entries already stored with the same title and author.
            cursor.execute(
                'SELECT 1 FROM keywords WHERE title = ? AND author = ?',
                (title, author),
            )
            if cursor.fetchone() is None:
                cursor.execute(
                    '''
                    INSERT INTO keywords (title, link, description, pubDate, author, language, keyword)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''',
                    (
                        title,
                        item.get("link", ""),
                        item.get("description", ""),
                        _format_pub_date(item.get("created_at", "")),
                        author,
                        item.get("language", ""),
                        item.get("keyword", ""),
                    ),
                )

        for item in data_repo:
            cursor.execute(
                '''
                INSERT INTO repos (title, link, description, pubDate, author, link2, keyword)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ''',
                (
                    item.get("name", ""),
                    item.get("link", ""),
                    item.get("description", ""),
                    _format_pub_date(item.get("updated_at", "")),
                    item.get("author", ""),
                    item.get("link_2", ""),
                    item.get("keyword", ""),
                ),
            )

        for item in data_release:
            cursor.execute(
                '''
                INSERT INTO releases (link, pubDate, author, keyword)
                VALUES (?, ?, ?, ?)
                ''',
                (
                    item.get("link", ""),
                    _format_pub_date(item.get("published_at", "")),
                    item.get("author", ""),
                    item.get("keyword", ""),
                ),
            )

        for item in data_user:
            title = item.get("name", "")
            author = item.get("author", "")
            # Skip entries already stored with the same title and author.
            cursor.execute(
                'SELECT 1 FROM users WHERE title = ? AND author = ?',
                (title, author),
            )
            if cursor.fetchone() is None:
                cursor.execute(
                    '''
                    INSERT INTO users (title, link, description, pubDate, author, keyword, language)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''',
                    (
                        title,
                        item.get("link", ""),
                        item.get("description", ""),
                        _format_pub_date(item.get("created_at", "")),
                        author,
                        item.get("keyword", ""),
                        item.get("language", ""),
                    ),
                )

        conn.commit()


def select_articles(e_hour):
    """Return the rows to report for the last *e_hour* hours.

    Args:
        e_hour: size of the look-back window in hours; three extra minutes
            are added to the window.

    Returns:
        A 4-tuple ``(keywords, repos, users, releases)`` of row lists.
        ``keywords`` holds unsent rows inside the window, ``users`` holds at
        most five unsent rows from the last 60 days, ``repos`` and
        ``releases`` hold every row inside the window.  Returned
        ``keywords`` and ``users`` rows are flagged as sent.
    """
    now = datetime.now()
    now_text = now.strftime(_SQL_TIME_FORMAT)
    window_start = (now - timedelta(hours=e_hour, minutes=3)).strftime(_SQL_TIME_FORMAT)
    # "Two months" is approximated as 60 days.
    two_months_ago = (now - timedelta(days=60)).strftime(_SQL_TIME_FORMAT)

    with closing(sqlite3.connect(_DB_PATH)) as conn:
        cursor = conn.cursor()

        cursor.execute('''
            SELECT * FROM keywords
            WHERE is_sended IS NULL AND pubDate BETWEEN ? AND ?
            ORDER BY pubDate DESC
        ''', (window_start, now_text))
        result_1 = cursor.fetchall()
        # Literal 1 instead of the TRUE keyword keeps older SQLite builds working.
        cursor.executemany(
            'UPDATE keywords SET is_sended = 1 WHERE id = ?',
            [(row[0],) for row in result_1],
        )

        cursor.execute('''
            SELECT * FROM repos
            WHERE pubDate BETWEEN ? AND ?
        ''', (window_start, now_text))
        result_2 = cursor.fetchall()

        cursor.execute('''
            SELECT * FROM users
            WHERE is_sended IS NULL AND pubDate BETWEEN ? AND ?
            ORDER BY pubDate DESC
            LIMIT 5
        ''', (two_months_ago, now_text))
        result_3 = cursor.fetchall()
        cursor.executemany(
            'UPDATE users SET is_sended = 1 WHERE id = ?',
            [(row[0],) for row in result_3],
        )

        cursor.execute('''
            SELECT * FROM releases
            WHERE pubDate BETWEEN ? AND ?
        ''', (window_start, now_text))
        result_4 = cursor.fetchall()

        # Commit the "sent" flags only after every query succeeded, so a
        # failure cannot mark rows as sent without returning them.
        conn.commit()

    return result_1, result_2, result_3, result_4


def clear_table():
    """Delete every row from the ``repos`` and ``releases`` tables."""
    with closing(sqlite3.connect(_DB_PATH)) as conn:
        conn.execute('DELETE FROM repos')
        conn.execute('DELETE FROM releases')
        conn.commit()


def record_md(result, filename="./history/github.md"):
    """Prepend *result* to the markdown history file *filename*.

    The file and its parent directory are created when missing.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            existing_content = file.read()
    else:
        existing_content = ""

    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Newest content goes first.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(result + existing_content)


def _render_message(entry, choice, short):
    """Render one row as message text; unknown *choice* yields ``""``.

    Column positions follow the table layouts in ``create_database``:
    for ``keywords`` rows index 6 is the keyword and index 7 the language.
    """
    if choice == 1:
        parts = [f"关键词【{entry[6]}】发现新项目:[{entry[1]}]({entry[2]})\n"]
        if not short:
            # The short form drops the description line for keyword hits.
            parts.append(f"项目描述:{entry[3]}\n")
        parts.append(f"上传时间:{entry[4]}\n")
        parts.append(f"开发语言:{entry[7]}\t\t作者:{entry[5]}\n")
    elif choice == 2:
        parts = [
            f"项目:[{entry[1]}]({entry[2]})存在更新!!!\n",
            f"更新描述:{entry[3]}\n",
            f"更新时间:{entry[4]}\n",
            f"提交者:{entry[5]},[点此查看提交详情]({entry[2]})\n",
        ]
    elif choice == 3:
        parts = [
            f"大佬 {entry[5]} 上传了一个新工具:[{entry[1]}]({entry[2]})\n",
            f"项目描述:{entry[3]}\n",
            f"上传时间:{entry[4]}\n",
        ]
    elif choice == 4:
        parts = [
            f"【{entry[3]}】为[{entry[4]}]({entry[1]})发布了新版本,请及时查收!\n",
            f"发布时间:{entry[2]}\n",
        ]
    else:
        return ""
    # Separator between entries: 3 dashes in short mode, 10 otherwise.
    parts.append("\n" + "-" * (3 if short else 10) + "\n")
    return "".join(parts)


def _render_record(entry, choice):
    """Render one row as a markdown history record; unknown *choice* yields ``""``."""
    if choice == 1:
        parts = [
            f"#### 关键词【{entry[6]}】发现新项目:[{entry[1]}]({entry[2]})\n",
            f"**项目描述**:{entry[3]}\n",
            f"**上传时间**:{entry[4]}\n",
            f"**开发语言**:{entry[7]}\t\t**作者**:{entry[5]}\n",
        ]
    elif choice == 2:
        parts = [
            f"#### 项目:[{entry[1]}]({entry[2]})存在更新!!!\n",
            f"**更新描述**:{entry[3]}\n",
            f"**更新时间**:{entry[4]}\n",
            f"**提交者**:{entry[5]},[点此查看提交详情]({entry[2]})\n",
        ]
    elif choice == 3:
        parts = [
            f"#### 大佬 {entry[5]} 上传了一个新工具:[{entry[1]}]({entry[2]})\n",
            f"**项目描述**:{entry[3]}\n",
            f"**上传时间**:{entry[4]}\n",
        ]
    elif choice == 4:
        parts = [
            f"#### 【{entry[3]}】为[{entry[4]}]({entry[1]})发布了新版本,请及时查收!\n",
            f"**发布时间**:{entry[2]}\n",
        ]
    else:
        return ""
    parts.append("\n" + "-" * 10 + "\n")
    return "".join(parts)


def get_filtered_articles(entries, Is_short, choice):
    """Render *entries* as message text and append them to the history file.

    Args:
        entries: rows as returned by ``select_articles``.
        Is_short: ``True`` for the compact layout, ``False`` for the full
            one.  Any value equal to neither produces no message text, but
            the history record is still written.
        choice: 1 = keywords, 2 = repos, 3 = users, 4 = releases.

    Returns:
        The concatenated message text for all entries.
    """
    # ``in`` compares with ``==`` so 0/1 keep behaving like False/True.
    render_message = Is_short in (True, False)
    short = bool(Is_short) if render_message else False

    messages = []
    records = []
    for entry in entries:
        if render_message:
            messages.append(_render_message(entry, choice, short))
        records.append(_render_record(entry, choice))

    record_md("".join(records))
    return "".join(messages)


def Src_github(e_hour, Is_short):
    """Refresh the database from the JSON files and render recent rows.

    Args:
        e_hour: look-back window in hours, passed to ``select_articles``.
        Is_short: layout switch, passed to ``get_filtered_articles``.

    Returns:
        A 4-tuple of message strings ``(keywords, repos, users, releases)``;
        a category without rows yields ``""``.
    """
    # Always ensure the tables exist: the call is idempotent and also
    # repairs a database file that was created but never initialised.
    create_database()
    clear_table()
    insert_data()

    keyword_data, repo_data, user_data, release_data = select_articles(e_hour)
    batches = (keyword_data, repo_data, user_data, release_data)
    # Empty categories are skipped so they leave no trace in the history.
    return tuple(
        get_filtered_articles(rows, Is_short, choice) if rows else ""
        for choice, rows in enumerate(batches, start=1)
    )


if __name__ == "__main__":
    result_1, result_2, result_3, result_4 = Src_github(24000, False)
    for text in (result_1, result_2, result_3, result_4):
        if text != "":
            print(text)

    if result_1 == "" and result_2 == "" and result_3 == "" and result_4 == "":
        # Nothing to report.
        print("-" * 10)
        print("github数据为空,跳过执行。")