510 lines
18 KiB
Python
510 lines
18 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
import json
|
||
import sqlite3
|
||
import os
|
||
from datetime import datetime, timedelta
|
||
|
||
def create_database():
    """Create the GitHub monitoring database and its four tables.

    Opens ``./resources/db/github.db`` (creating the file and its parent
    directory when missing) and issues ``CREATE TABLE IF NOT EXISTS`` for
    the ``keywords``, ``repos``, ``releases`` and ``users`` tables, so
    running it against an existing database leaves existing data alone.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or the schema
            script fails.
    """
    db_path = './resources/db/github.db'
    # sqlite3 creates the database file itself but not missing directories.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.executescript('''
            CREATE TABLE IF NOT EXISTS keywords (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                link TEXT,
                description TEXT,
                pubDate DATETIME,
                author TEXT,
                keyword TEXT,
                language TEXT,
                is_sended BOOLEAN
            );
            CREATE TABLE IF NOT EXISTS repos (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                link TEXT,
                description TEXT,
                pubDate DATETIME,
                author TEXT,
                keyword TEXT,
                link2 TEXT
            );
            CREATE TABLE IF NOT EXISTS releases (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                link TEXT,
                pubDate DATETIME,
                author TEXT,
                keyword TEXT
            );
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                link TEXT,
                description TEXT,
                pubDate DATETIME,
                author TEXT,
                keyword TEXT,
                language TEXT,
                is_sended BOOLEAN
            );
        ''')
        conn.commit()
    finally:
        # Release the file handle even when the schema script fails.
        conn.close()
|
||
|
||
|
||
def _load_json_list(path, label):
    """Return the JSON list stored at *path*, or ``[]`` when the file is empty."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"{label}文件不存在,请检查程序是否运行正常!")
    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()
    if not content.strip():
        # An empty file means "nothing crawled"; treat it as an empty list
        # instead of leaving the caller with an unbound variable.
        return []
    data = json.loads(content)
    if not isinstance(data, list):
        raise ValueError("JSON文件格式错误,请检查爬取程序是否异常!")
    return data


def _shift_timestamp(raw):
    """Add 8 hours to a ``%Y-%m-%dT%H:%M:%SZ`` string; return *raw* as-is if it does not parse."""
    try:
        parsed = datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ')
    except (ValueError, TypeError):
        # TypeError covers a JSON null / non-string value.
        return raw
    # NOTE(review): fixed +8h offset, presumably UTC -> UTC+8 local time; confirm.
    return (parsed + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')


def _insert_project_if_new(cursor, table, item):
    """Insert *item* into *table* unless a row with the same title and author exists."""
    title = item.get("name", "")
    author = item.get("author", "")
    # *table* is only ever one of the literals passed by insert_data below,
    # never external input, so interpolating it into the SQL is safe.
    cursor.execute(
        f'SELECT 1 FROM {table} WHERE title = ? AND author = ?',
        (title, author),
    )
    if cursor.fetchone() is not None:
        return
    cursor.execute(
        f'''
        INSERT INTO {table} (title, link, description, pubDate, author, keyword, language)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ''',
        (
            title,
            item.get("link", ""),
            item.get("description", ""),
            _shift_timestamp(item.get("created_at", "")),
            author,
            item.get("keyword", ""),
            item.get("language", ""),
        ),
    )


def insert_data():
    """Load the four crawler JSON dumps and store them in ``github.db``.

    Reads ``github_keyword.json``, ``github_repo.json``,
    ``github_release.json`` and ``github_user.json`` from
    ``./resources/JSON/``. Each file must hold a JSON list (an empty file
    counts as an empty list). Timestamps are shifted by +8 hours before
    being stored; values that do not parse are stored unchanged.

    ``keywords`` and ``users`` rows are skipped when a row with the same
    title and author already exists; ``repos`` and ``releases`` rows are
    always inserted.

    Raises:
        FileNotFoundError: If any of the four JSON files is missing.
        ValueError: If a file holds invalid JSON or its top level is not
            a list.
        sqlite3.Error: If a database statement fails.
    """
    # All files are loaded and validated before the database is touched.
    keyword_items = _load_json_list('./resources/JSON/github_keyword.json', 'github_keyword')
    repo_items = _load_json_list('./resources/JSON/github_repo.json', 'github_repo')
    release_items = _load_json_list('./resources/JSON/github_release.json', 'github_release')
    user_items = _load_json_list('./resources/JSON/github_user.json', 'github_user')

    conn = sqlite3.connect('./resources/db/github.db')
    try:
        cursor = conn.cursor()

        for item in keyword_items:
            _insert_project_if_new(cursor, 'keywords', item)

        for item in repo_items:
            cursor.execute(
                '''
                INSERT INTO repos (title, link, description, pubDate, author, link2, keyword)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ''',
                (
                    item.get("name", ""),
                    # The repo link is built from the "keyword" field.
                    f"https://github.com/{item.get('keyword', '')}",
                    item.get("description", ""),
                    _shift_timestamp(item.get("updated_at", "")),
                    item.get("author", ""),
                    item.get("link_2", ""),
                    item.get("keyword", ""),
                ),
            )

        for item in release_items:
            cursor.execute(
                '''
                INSERT INTO releases (link, pubDate, author, keyword)
                VALUES (?, ?, ?, ?)
                ''',
                (
                    item.get("link", ""),
                    _shift_timestamp(item.get("published_at", "")),
                    item.get("author", ""),
                    item.get("keyword", ""),
                ),
            )

        for item in user_items:
            _insert_project_if_new(cursor, 'users', item)

        conn.commit()
    finally:
        # Close the connection even if an insert fails part-way through;
        # uncommitted rows are discarded in that case.
        conn.close()
|
||
|
||
def select_articles(e_hour):
    """Fetch the rows that are due for notification and flag them as sent.

    The reporting window runs from ``e_hour`` hours plus 3 minutes before
    the current local time (``datetime.now()``) up to now.

    Args:
        e_hour: Look-back window in hours for keywords, repos and
            releases.

    Returns:
        A 4-tuple ``(keywords, repos, users, releases)`` of row lists as
        returned by ``SELECT *``:

        * keywords - unsent rows published inside the window, newest
          first; they are marked ``is_sended`` before returning.
        * repos - every row published inside the window.
        * users - at most 5 unsent rows from the last 60 days, newest
          first; they are marked ``is_sended`` before returning.
        * releases - every row published inside the window.

    Raises:
        sqlite3.Error: If a query fails (for example a missing table).
    """
    stamp = '%Y-%m-%d %H:%M:%S'
    now = datetime.now()
    window_end = now.strftime(stamp)
    # NOTE(review): the extra 3 minutes look like slack for scheduler
    # jitter between runs; confirm with the caller's schedule.
    window_start = (now - timedelta(hours=e_hour, minutes=3)).strftime(stamp)
    # "Two months" is approximated as 60 days.
    users_start = (now - timedelta(days=60)).strftime(stamp)

    conn = sqlite3.connect('./resources/db/github.db')
    try:
        cursor = conn.cursor()

        cursor.execute(
            '''
            SELECT * FROM keywords
            WHERE is_sended IS NULL AND pubDate BETWEEN ? AND ?
            ORDER BY pubDate DESC
            ''',
            (window_start, window_end),
        )
        result_1 = cursor.fetchall()
        # Store 1 rather than the TRUE keyword, which older SQLite
        # versions do not recognise.
        cursor.executemany(
            'UPDATE keywords SET is_sended = 1 WHERE id = ?',
            [(row[0],) for row in result_1],
        )
        conn.commit()

        cursor.execute(
            '''
            SELECT * FROM repos
            WHERE pubDate BETWEEN ? AND ?
            ''',
            (window_start, window_end),
        )
        result_2 = cursor.fetchall()

        # Latest 5 unsent user uploads that are at most 60 days old.
        cursor.execute(
            '''
            SELECT * FROM users
            WHERE is_sended IS NULL AND pubDate BETWEEN ? AND ?
            ORDER BY pubDate DESC
            LIMIT 5
            ''',
            (users_start, window_end),
        )
        result_3 = cursor.fetchall()
        cursor.executemany(
            'UPDATE users SET is_sended = 1 WHERE id = ?',
            [(row[0],) for row in result_3],
        )
        conn.commit()

        cursor.execute(
            '''
            SELECT * FROM releases
            WHERE pubDate BETWEEN ? AND ?
            ''',
            (window_start, window_end),
        )
        result_4 = cursor.fetchall()
    finally:
        # Always release the connection, including when a query raises.
        conn.close()

    return result_1, result_2, result_3, result_4
|
||
|
||
def clear_table():
    """Delete every row from the ``repos`` and ``releases`` tables.

    The ``keywords`` and ``users`` tables are left untouched.

    Raises:
        sqlite3.Error: If the database cannot be opened or a table is
            missing.
    """
    conn = sqlite3.connect('./resources/db/github.db')
    try:
        for table in ('repos', 'releases'):
            conn.execute(f'DELETE FROM {table}')
        conn.commit()
    finally:
        # Close the connection even when a DELETE fails.
        conn.close()
|
||
|
||
def record(title, link, description, upload_time, author, keyword, language, source):
    """Append one notification record to the ``github`` table of ``web.db``.

    The table (and the ``./resources/db`` directory) is created on first
    use. Any argument may be ``None``, in which case NULL is stored.

    Args:
        title: Stored in the ``article_title`` column.
        link: URL of the item.
        description: Free-text description.
        upload_time: Timestamp, stored as given.
        author: Author or committer name.
        keyword: Stored in the ``keyword`` column.
        language: Programming language of the project.
        source: Label describing what kind of event produced the record.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or written.
    """
    db_path = './resources/db/web.db'
    # sqlite3 creates the database file itself but not missing directories.
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        # Create the table on first use.
        conn.execute('''CREATE TABLE IF NOT EXISTS github (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            article_title TEXT,
            link TEXT,
            description TEXT,
            author TEXT,
            upload_time DATETIME,
            keyword TEXT,
            language TEXT,
            source TEXT
        )''')

        conn.execute(
            '''
            INSERT INTO github (article_title, link, description, author, upload_time, keyword, language, source)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''',
            (title, link, description, author, upload_time, keyword, language, source),
        )
        conn.commit()
    finally:
        # Close the connection even when the insert fails.
        conn.close()
|
||
|
||
def _describe_entry(entry, choice):
    """Return ``(long_text, short_text, record_fields)`` for one row, or ``None`` for an unknown *choice*."""
    long_rule = "\n" + "-" * 10 + "\n"   # separator between long entries
    short_rule = "\n" + "-" * 3 + "\n"   # separator between short entries

    if choice == 1:
        headline = f"关键词【{entry[6]}】发现新项目:[{entry[1]}]({entry[2]})\n"
        uploaded = f"上传时间:{entry[4]}\n"
        credits = f"开发语言:{entry[7]}\t\t作者:{entry[5]}\n"
        long_text = headline + f"项目描述:{entry[3]}\n" + uploaded + credits + long_rule
        short_text = headline + uploaded + credits + short_rule
        fields = dict(
            title=entry[1],
            link=entry[2],
            description=entry[3],
            upload_time=entry[4],
            author=entry[5],
            keyword=entry[6],
            language=entry[7],
            source="New project",
        )
    elif choice == 2:
        headline = f"项目:[{entry[1]}]({entry[2]})存在更新!!!\n"
        updated = f"更新时间:{entry[4]}\n"
        committer = f"提交者:{entry[5]},[点此查看提交详情]({entry[7]})\n"
        long_text = headline + f"更新描述:{entry[3]}\n" + updated + committer + long_rule
        short_text = headline + updated + committer + short_rule
        fields = dict(
            title=entry[1],
            link=entry[2],
            description=entry[3],
            upload_time=entry[4],
            author=entry[5],
            # NOTE(review): entry[7] is also used above as the commit-detail
            # link, so the link ends up in the keyword column. Kept as-is;
            # confirm whether entry[6] was intended.
            keyword=entry[7],
            language=None,
            source="Project update",
        )
    elif choice == 3:
        headline = f"大佬 {entry[5]} 上传了一个新工具:[{entry[1]}]({entry[2]})\n"
        uploaded = f"上传时间:{entry[4]}\n"
        long_text = headline + f"项目描述:{entry[3]}\n" + uploaded + long_rule
        short_text = headline + uploaded + short_rule
        fields = dict(
            title=entry[1],
            link=entry[2],
            description=entry[3],
            upload_time=entry[4],
            author=entry[5],
            keyword=None,
            language=None,
            source="New tool",
        )
    elif choice == 4:
        headline = f"【{entry[3]}】为[{entry[4]}]({entry[1]})发布了新版本,请及时查收!\n"
        published = f"发布时间:{entry[2]}\n"
        long_text = headline + published + long_rule
        short_text = headline + published + short_rule
        fields = dict(
            title=entry[4],
            link=entry[1],
            description=None,
            upload_time=entry[2],
            author=entry[3],
            keyword=None,
            language=None,
            source="New version",
        )
    else:
        return None

    return long_text, short_text, fields


def get_filtered_articles(entries, choice, max_chunk_bytes=4096):
    """Render database rows as notification text and archive each one.

    Every entry is formatted twice: a long form (with description) that
    is concatenated into one string, and a short form that is packed into
    chunks of at most ``max_chunk_bytes`` UTF-8 bytes. Each entry is also
    passed to ``record`` for archiving.

    Args:
        entries: Iterable of row tuples. Fields are read by position, so
            the layout must match the table the rows came from
            (presumably ``SELECT *`` from keywords / repos / users /
            releases for choices 1-4 respectively; verify against caller).
        choice: Which template to use: 1 new project for a keyword,
            2 project update, 3 new tool by a followed user, 4 new
            release.
        max_chunk_bytes: Upper bound, in UTF-8 bytes, for each short
            chunk. A single entry larger than this still becomes its own
            (oversized) chunk.

    Returns:
        ``(result_long, short_results)``: the concatenated long text and
        the list of short-text chunks. ``("", [])`` for empty input.

    Raises:
        ValueError: If ``choice`` is not 1-4 and there is at least one
            entry to format.
    """
    long_parts = []
    short_results = []
    pending = ""

    for entry in entries:
        described = _describe_entry(entry, choice)
        if described is None:
            raise ValueError(f"unsupported choice: {choice!r}")
        long_text, short_text, fields = described

        long_parts.append(long_text)
        # Archive the entry.
        record(**fields)

        candidate = pending + short_text
        if len(candidate.encode('utf-8')) > max_chunk_bytes:
            # Flush the current chunk, but never emit an empty one (which
            # would happen when the very first entry is already too big).
            if pending:
                short_results.append(pending)
            pending = short_text
        else:
            pending = candidate

    # Flush whatever is left over.
    if pending:
        short_results.append(pending)

    return "".join(long_parts), short_results
|
||
|
||
def Src_github(e_hour):
    """Run one full GitHub monitoring pass and return the rendered results.

    Creates the database when its file is missing, empties the per-run
    tables, loads the crawler output, selects the rows that fall inside
    the reporting window and formats each category.

    Args:
        e_hour: Look-back window in hours, forwarded to
            ``select_articles``.

    Returns:
        A list of four ``(long_text, short_chunks)`` tuples in the order
        keywords, repos, users, releases. A category without rows yields
        ``("", [])``.
    """
    # Build the database and its tables on first run.
    if not os.path.exists('./resources/db/github.db'):
        create_database()

    # Empty the tables, then load the fresh crawler output.
    clear_table()
    insert_data()

    # Rows that fall inside the requested time window.
    keyword_data, repo_data, user_data, release_data = select_articles(e_hour)

    results = []
    categories = (keyword_data, repo_data, user_data, release_data)
    # The 1-based position doubles as the template number.
    for choice, rows in enumerate(categories, start=1):
        if not rows:
            results.append(("", []))
            continue
        long_text, short_chunks = get_filtered_articles(rows, choice)
        results.append((long_text, short_chunks))

    return results
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: look back 240 hours and dump everything to stdout.
    results = Src_github(240)
    divider = "\n" + "-" * 10 + "\n"

    for section_no, (long_text, chunks) in enumerate(results, start=1):
        if long_text:
            print(f"长文本结果 {section_no}:")
            print(long_text)
            print(divider)

        if chunks:
            print(f"分块的短文本结果 {section_no}:")
            # NOTE(review): the source lost its indentation; the divider is
            # assumed to follow every chunk rather than the whole list.
            for chunk in chunks:
                print(chunk)
                print(divider)

    # Nothing in any category: report that the run is being skipped.
    if not any(long_text != "" or chunks for long_text, chunks in results):
        print("-" * 10)
        print("github数据为空,跳过执行。")
|