From 8ec55edd2a90cee5f772e356e27db7ccce2f9413 Mon Sep 17 00:00:00 2001 From: MasonLiu <2857911564@qq.com> Date: Fri, 10 Jan 2025 16:56:13 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E7=BA=BF=E7=99=BE=E5=BA=A6=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E7=9B=91=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Core.py | 46 ++++- Dev_test.py | 177 +++++++++++----- GotoSend/__pycache__/github.cpython-312.pyc | Bin 17391 -> 17622 bytes GotoSend/baidu.py | 194 ++++++++++++++++++ GotoSend/github.py | 8 +- UpdateLOG.md | 3 +- config/check_config.py | 24 ++- config/config.yaml | 1 + config/keywords.yaml | 3 +- resources/JSON/baidu.json | 144 +++++++++++++ resources/db/baidu.db | Bin 0 -> 24576 bytes resources/history/baidu_news.md | 70 +++++++ resources/log/core.log | 1 + spider/__pycache__/github.cpython-312.pyc | Bin 9453 -> 9571 bytes spider/__pycache__/sougou_wx.cpython-312.pyc | Bin 5853 -> 5687 bytes spider/baidu.py | 150 ++++++++++++++ spider/sougou_wx.py | 6 - baidu开发文档.md => 开发文档/baidu开发文档.md | 5 +- .../github开发文档.md | 0 19 files changed, 753 insertions(+), 79 deletions(-) create mode 100644 GotoSend/baidu.py create mode 100644 resources/JSON/baidu.json create mode 100644 resources/db/baidu.db create mode 100644 resources/history/baidu_news.md create mode 100644 spider/baidu.py rename baidu开发文档.md => 开发文档/baidu开发文档.md (83%) rename github开发文档.md => 开发文档/github开发文档.md (100%) diff --git a/Core.py b/Core.py index 2262828..31dc1d4 100644 --- a/Core.py +++ b/Core.py @@ -19,6 +19,7 @@ from spider.freebuf import freebuf_main from spider.xianzhi import xianzhi_main from spider.sougou_wx import sougou_wx_main from spider.github import github_main, load_github_config +from spider.baidu import baidu_main from GotoSend.M_4hou import Src_4hou from GotoSend.anquanke import Src_anquanke from GotoSend.doonsec import Src_doonsec @@ -28,6 +29,7 @@ from GotoSend.qianxin import Src_qianxin from GotoSend.seebug import Src_seebug from GotoSend.sougou_wx import Src_sougou_wx from GotoSend.github import Src_github +from GotoSend.baidu import Src_baidu from config.check_config import get_core_config, get_debug_config, get_kewords_config from loguru import logger @@ -86,7 +88,7 @@ def check_avaliable(info_long, info_short, title, webhook_url, timestamp, sign): logger.info(f"{title}数据为空,跳过执行。") def send_job_RSS(time_1): - Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() + Doonsec_switch, Doonsec = get_kewords_config('Doonsec') # print(f"当前配置信息:Doonsec_switch:{Doonsec_switch}") # 爬取数据 seebug_main() @@ -127,7 +129,7 @@ def send_job_RSS(time_1): check_avaliable(result_seebug_long, result_seebug_short, "Seebug社区资讯", webhook_url, timestamp, sign) def send_job_SX(): - Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() + Sogou_WX = get_kewords_config('Sogou_WX') sougou_wx_main(Sogou_WX) result_sx_long = Src_sougou_wx(False) result_sx_short = Src_sougou_wx(True) @@ -146,6 +148,14 @@ def send_job_github(time_1): check_avaliable(result_github_3_long, result_github_3_short, "Github项目监控-大佬工具", webhook_url, timestamp, sign) check_avaliable(result_github_4_long, result_github_4_short, "Github项目监控-项目版本发布监测", webhook_url, timestamp, sign) +def send_job_baidu(): + Baidu = get_kewords_config('Baidu') + baidu_main(Baidu) + result_baidu_long = Src_baidu(False) + result_baidu_short = Src_baidu(True) + webhook_url, timestamp, sign = gen_sign() + check_avaliable(result_baidu_long, result_baidu_short, "百度搜索关键词相关内容", webhook_url, timestamp, sign) + # 探测rss源状态 def check_rss_status(url): try: @@ -190,6 +200,8 @@ def main_job(e_hour): send_job_SX() if 2 in choice: send_job_github(e_hour) + if 3 in choice: + send_job_baidu() logger.info("单次运行结束,等待下一次运行...") def main_loop(time_choice): @@ -227,25 +239,39 @@ def send_first_message(): start_info += "时间配置:每隔" + str(e_hour) + "小时执行一次推送\n" elif time_choice == 0: start_info += "时间配置:每天固定时间点执行推送\n" + start_info += "开启状态:\n" + + if 0 in choice: + start_info += "RSS源监测\n" + if 1 in choice: + start_info += "搜狗-微信公众号监测\n" + if 2 in choice: + start_info += "Github项目监测\n" + if 3 in choice: + start_info += "百度搜索关键词内容监测\n" + + if fs_activate == "True": + result = SendToFeishu(start_info, "程序信息", webhook_url_once, timestamp_once, sign_once) + logger.info(result) + send_result = SendToFeishu(f"[点此访问]({url_web})网站以查看全部文章。", "首次运行提醒", webhook_url_once, timestamp_once, sign_once) + logger.info(send_result) + if wx_activate == "True": + result = SendToWX(start_info, "程序信息") + logger.info(result) + send_result = SendToWX(f"[点此访问]({url_web})网站以查看全部文章。", "首次运行提醒") + logger.info(send_result) + if 0 in choice: if fs_activate == "True": - result = SendToFeishu(start_info, "程序信息", webhook_url_once, timestamp_once, sign_once) - logger.info(result) result = SendToFeishu(rss_info, "RSS源状态", webhook_url_once, timestamp_once, sign_once) # logger.info(rss_info) logger.info(result) - send_result = SendToFeishu(f"[点此访问]({url_web})网站以查看全部文章。", "首次运行提醒", webhook_url_once, timestamp_once, sign_once) - logger.info(send_result) else: pass if wx_activate == "True": - result = SendToWX(start_info, "程序信息") - logger.info(result) result = SendToWX(rss_info, "RSS源状态") # logger.info(rss_info) logger.info(result) - send_result = SendToWX(f"[点此访问]({url_web})网站以查看全部文章。", "首次运行提醒") - logger.info(send_result) else: pass diff --git a/Dev_test.py b/Dev_test.py index b1dc55e..c50a39d 100644 --- a/Dev_test.py +++ b/Dev_test.py @@ -1,50 +1,135 @@ -# -*- coding: utf-8 -*- -""" -@Author: MasonLiu -@Description: 测试用脚本,无需关注。 -""" +# # -*- coding: utf-8 -*- +# """ +# @Author: MasonLiu +# @Description: 测试用脚本,无需关注。 +# """ -import schedule -import os -import signal -import sys -import time -import yaml -import requests -from datetime import datetime, timedelta -from SendCore.FeishuSendBot import SendToFeishu, gen_sign -from SendCore.QiweiSendBot import SendToWX -from spider.common import run, seebug_main, M_4hou_main, anquanke_main, sec_wiki_main, huawei_main, doonsec_main, qianxin_main -from spider.freebuf import freebuf_main -from spider.xianzhi import xianzhi_main -from spider.sougou_wx import sougou_wx_main -from spider.github import github_main, load_github_config -from GotoSend.M_4hou import Src_4hou -from GotoSend.anquanke import Src_anquanke -from GotoSend.doonsec import Src_doonsec -from GotoSend.xianzhi import Src_xianzhi -from GotoSend.freebuf import Src_freebuf -from GotoSend.qianxin import Src_qianxin -from GotoSend.seebug import Src_seebug -from GotoSend.sougou_wx import Src_sougou_wx -from GotoSend.github import Src_github -from config.check_config import get_core_config, get_debug_config, get_kewords_config -from loguru import logger +# import schedule +# import os +# import signal +# import sys +# import time +# import yaml +# import requests +# from datetime import datetime, timedelta +# from SendCore.FeishuSendBot import SendToFeishu, gen_sign +# from SendCore.QiweiSendBot import SendToWX +# from spider.common import run, seebug_main, M_4hou_main, anquanke_main, sec_wiki_main, huawei_main, doonsec_main, qianxin_main +# from spider.freebuf import freebuf_main +# from spider.xianzhi import xianzhi_main +# from spider.sougou_wx import sougou_wx_main +# from spider.github import github_main, load_github_config +# from GotoSend.M_4hou import Src_4hou +# from GotoSend.anquanke import Src_anquanke +# from GotoSend.doonsec import Src_doonsec +# from GotoSend.xianzhi import Src_xianzhi +# from GotoSend.freebuf import Src_freebuf +# from GotoSend.qianxin import Src_qianxin +# from GotoSend.seebug import Src_seebug +# from GotoSend.sougou_wx import Src_sougou_wx +# from GotoSend.github import Src_github +# from config.check_config import get_core_config, get_debug_config, get_kewords_config +# from loguru import logger -# 全局变量 -webhook_url_once, timestamp_once, sign_once = gen_sign() -e_hour, time_choice, choice, fs_activate, wx_activate, ding_activate, lx_activate, url_web = get_core_config() -Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() -# print(f"当前配置信息:Doonsec_switch:{Doonsec_switch}") +# from baiduspider import BaiduSpider +# from pprint import pprint + +# # 全局变量 +# webhook_url_once, timestamp_once, sign_once = gen_sign() +# e_hour, time_choice, choice, fs_activate, wx_activate, ding_activate, lx_activate, url_web = get_core_config() +# Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() +# # print(f"当前配置信息:Doonsec_switch:{Doonsec_switch}") -if __name__ == "__main__": - # result_doonsec_long = Src_doonsec(False, Doonsec_switch, Doonsec) - # print(result_doonsec_long) - while True: - Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() - print(f"当前配置信息:Doonsec_switch:{Doonsec_switch}") - print(f"当前配置信息:Sogou_WX:{Sogou_WX}") - print(f"当前配置信息:Doonsec:{Doonsec}") - print("\n") - time.sleep(10) \ No newline at end of file +# # if __name__ == "__main__": +# # result_doonsec_long = Src_doonsec(False, Doonsec_switch, Doonsec) +# # print(result_doonsec_long) +# # while True: +# # Sogou_WX, Doonsec_switch, Doonsec = get_kewords_config() +# # print(f"当前配置信息:Doonsec_switch:{Doonsec_switch}") +# # print(f"当前配置信息:Sogou_WX:{Sogou_WX}") +# # print(f"当前配置信息:Doonsec:{Doonsec}") +# # print("\n") +# # time.sleep(10) +# # pprint(BaiduSpider().search_web(input(), exclude=['all'])) + +# import requests + +# def fetch_url(url, headers): +# try: +# response = requests.get(url, headers=headers) +# response.raise_for_status() # 如果响应状态码不是200,会抛出异常 +# return response.text +# except requests.RequestException as e: +# print(f"请求失败: {e}") +# return None + +# # 示例使用 +# if __name__ == "__main__": +# url = "https://www.baidu.com/s?tn=baidurt&cl=3&rn=20&ie=utf-8&rsv_bp=1&wd=齐鲁银行" # 替换为你要爬取的URL +# headers = { +# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", +# "Accept-Language": "en-US,en;q=0.9", +# # 可以在这里添加其他需要的头信息 +# } + +# content = fetch_url(url, headers) +# if content: +# print(content) +''' +
++ + + 山东省城市商业银行合作联盟有限公司 - 爱企查 + ++ + + +
+
+ 2024-11-29
+
+
+
+ 目前,联盟共有15家城商行股东,注册资本10.45亿元,线上运营银行资产突破3万亿元。公司自成立以来,以提升成员行信息科技支撑水平和风险治理水平为重点,为67家成员行提供了持续、稳定、强大的信息系统支撑,引领成员行通过科技创新推动业务发展和转型升级。公司接受监管部门严格的信息科技风险评估和监管,被原中国银监会列入首...
+
+ + + + + www.aiqicha.com/company_detail_82845... + + - + 百度快照 + + + + |
+
r?fY<>t63&loMa2%#{(oL}3X`-5=n~2HAN*UOde3B$@qMj%`6+(hPa))-6lO?+M)T1iUp|MzGG; zBf)Afc9ghmqRU~Im%>uAyDXcdUfz~j EZ%I+EU=g(DcJ}0 zs;teEctlShbrDN~{(Rn{-%{ZR%5FzyJYX$4yh3}Im|lH2kfrcm-c!=wEXDh1_c-ku z#>7MAg9F^qkY}JMVH)rjnSs y2{jagMrY2^X@z6gg z_pR_^C?H^}G2w!;HX|kG<;1SMHY4yqC9c#AMzkU&Uy$&YC?$5WL 2#c~#IWVL1=in|HZ%Xj^ u;ZH3 7rU&5Dg9vmqTxoZ_4cjmYOA*z zUtis}`mu?*ZK(;>)#}G3Kxcu+K#A0NyZk&UCt<&hk)9FK+X1!9*|< Q{}WID0G@E;AdJ+vTYUS%PuDpe!?) z<9e4@sfiq4_??H&^M$;VrnBeTK7ZLXA3DLXIbr6*JXU&I_yPgT2! xkkHY*}}*te_v4tE1Jb&T0N>+eenVbPby0*u)m 2?G()2= z4y)e2*E1+cGPJ~)5B-K=f}T(68|IrIwt&x$P}M-@1Y^8DGr?^0-r+)K@S>b>dT&S> z#cA8Oc48lY^c22cKhDnP^G^M%3IB|bK>>pT1_cZX7!)ulU{JuIfI$I+0tN*P3K$eH zDDb_Zz?*qT-pxyc8h*(3|HpCM|907%pPu}O {7>G}U2 8*zRe?IY#Z~V*A ze>(D?cr-qLMigjuo_f<1`BB8(<_8uhOXa53y|(%RSx2X~Dmmz^QdwUiUAts_2jCZ= z?Sa^1tHLtC)+!PXaB~iS^LZLWb-{q|j*1y>3iz+zu{Kym3rE(SyWSA2JL|3WG&cIZ z@A+;50_P1@`Of~rcdb$_Ux}0k;3<9L%iwSvo(J>_?;P(NkZDq4h)zF}$HxF{#$zfx zAU_?F60>yd9t%#;f#}lcH9+EYF&$|Npk|Jd 0e!y!v_UBmAoLuzb;F0Ipz1+v+OM}~!8eMl#E$;2ot&Y^cb9e`J| zw<@GZnQsP=mc3Q+d$jK^fKGS>Gz(Zry0S(#pR>2k(_~;?O56ov5W@#j^PsT;k_50) z2F_M|D1__7w@;eNKWn?33*jmf=s-;F9!0@@Xzj*eb&y9M+;jn9?X)`?7*F2~q42W5 zwn;b-Z29R7npGZJ<>4sYC}%^{MWkdBdGHB;1cI2yU}yj_HQ+=ak-|W=-A@r!!oynF z1Wj!*7D|Vv=z~nm;1!j)Vc-FI0R#jfeq|4vtBbs0aR&fu=b&^msu)N}1TK)iNz9U# z$fH^YqJW!oteF|8Gt|fc %^~veF@{z)Ap| z5#)iM%JVBZFlUW$$w$icLRqSLRZ0xXPo_A!nGG;DS;nYXWMlsrD3gh^8tYWHKkkRW zMh?Wkl%-l0>BcyHzChZ0xJZ==3_};|m0K185H2gR*_^f_x6@Ht%0-t?QCWtTsp$lL z_FS1B6 C&$g?}(W|60|q}d&WDd8LKk;#NS9R@O<$5;VIuhp}Uqt0mT#F2e= z7LfE3NOa)MeABNZXIoi7(t(QtMg}0UoV 7)E_(sc zqbz!ksg{xZv=wmb2 ?b=f(m;vM~Rm4N4I4Y*D+5HJqBbOP-DM zQd~}1HX8j#>A*6lYD_iSt5{^Yvthf+4p6h5E<2OGXgk&&1l*wKh0yM&nb4AFqhxT5 zyqFV9Z8mB9KB6I2?XHFjAoy&(6>$nq(Pb%iaPDCd8oU$IW;cnl=o-;Y!o6o$Y|r#2 z0J*?1qauKFK~9TfUopV`QO0iDE5otRyiBp9*rLTg?6s6GqDsDqMZjKUKOC9;S0`Z) z9*vx6Y0MRUho!_;2q`OZ7Gy_6yj=bdlKKXpr&bkh$Ul#eX{_NQZVh-XjZDh*I)yEI z%?q!#0QD9fHv-wh%_1dKWH>Xk1`y(?Cg}}2v)+C)L!2^p64qg6Ekf(k$g&x>1YW|{ z0`6UnbU=)Q-Xa5!AYkTj+lm#QThO8=5rCuC$|7de5G@uc`l^7r{1ccRD~mrCbK;K? zK?vpYUAPP1bZs WU(=@)0X?e@;oMtlS7mC!^4%G z4$*;7Wk(74-CZIgDANd6{+@*^UkVB#f1R%} 5tXw*P eZf^!taY>KpqX zZp%^?^xjq-UTmY|I?Q-!{rkV Hp^ QbcO)VM*_t9t6YwtPWYE>m z^2m==M}NyS{C=c;JSTHHd=@TO+o8ektylg6xBh%k*M4rF;z;`y;YGu46B9uu!t&6% zvOK}>XQ|Z76VFt{d_i5qcqoJe3I_w3XH9Xa*HGaS7oFqwPR*? 1*ed(5|q7 zzl>e{RETRN;KLDZxH##X{h3b-0(TH;`Ea?=!MXTlEm|(uA-Ujf#`P?GR4?fn=^2$0 zPX#)$hzms+kO)38fnAo6Xcv_3el@XI%|bSJ)u+D4sF!mUcM%+}*J^%{Z@LlrQO7&E zT&106S+4x-H1{}iZ(-3V7b<^tC38llxd5( iry2$_l literal 0 HcmV?d00001 diff --git a/resources/history/baidu_news.md b/resources/history/baidu_news.md new file mode 100644 index 0000000..190362e --- /dev/null +++ b/resources/history/baidu_news.md @@ -0,0 +1,70 @@ +#### 文章:[齐鲁银行信用卡申请进度入口查询 - 首页](https://ebank.qlbchina.com/pbank/) +描述:正在加载请稍候…… +**上传时间**:2025-01-08 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行5.40(-1.46%)_股票行情_新浪财经_新浪网](http://finance.sina.com.cn/realstock/company/sh601665/nc.shtml) +描述:新浪财经为您提供 齐鲁银行 (601665)股票实时行情走势,实时资金流向,实时新闻资讯,研究报告,股吧互动,交易信息,个股点评,公告,财务指标分析等与齐鲁银行(601665)股票相关的信息与服务. +**上传时间**:2025-01-08 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行山东省网点地址电话查询_齐鲁银行网点-金投网(手机金投网...](http://bank.cngold.org/yhwd/list_city_51_16.html) +描述:齐鲁银行 客服热线:40060-96588 简要名称:齐鲁银行 法定名称:齐鲁银行 银行性质: 城市商业银行 公司总部:济南市顺河街176号山东省-齐鲁银行网点查询 济南(共有105个齐鲁银行网点) 青岛(共有2个齐鲁银行网点) 聊城(共有7个齐鲁银行网点) 热门城市-齐鲁银行网点查询 北京 杭州 天津 重庆 苏州 南京 广州 厦门... +**上传时间**:2025-01-07 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行在您身旁总行](https://www.qlbchina.com/qlbchina/jrql/xwgg/cggg/zx/index.html) +描述:齐鲁银行 银企大集系统优化升级项目(二次)竞争性磋商公告 [2025-01-06] 齐鲁银行信用卡多渠道进件及信用卡审批系统人力外包项目(二次)竞争性磋商公告 [2025-01-06] 齐鲁银行村镇智能柜台系统改造项目竞争性磋商公告 [2025-01-03] 齐鲁银行商用密码安全评估服务项目 ... +**上传时间**:2025-01-06 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行(601665)_股票价格_行情_走势图—东方财富网](http://quote.eastmoney.com/unify/r/1.601665?from=classic&eventcode=Web_quote_entrance1) +描述:提供 齐鲁银行 (601665)股票的行情走势、五档盘口、逐笔交易等实时行情数据,及齐鲁银行(601665)的新闻资讯、公司公告、研究报告、行业研报、F10资料、行业资讯、资金流分析、阶段涨幅、所属板块、财务指标、机构观点、行业排名、估值水平、股吧互动等与齐鲁银行(601665)有关 +**上传时间**:2025-01-03 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行(601665)_股票行情,行情首页_中财网](https://gg.cfi.cn/quote.aspx?stockid=92680&contenttype=outline&client=pc) +描述:中财网提供 齐鲁银行 (601665)实时行情动态分析,全面报道齐鲁银行(601665)基本资料及重大新闻、行业资讯,浏览齐鲁银行(601665)财务数据、行情数据,公司公告,重大事件。 +**上传时间**:2024-12-31 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行(601665)股本结构_新浪财经_新浪网](http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_StockStructure/stockid/601665.phtml) +描述:齐鲁银行 5.59 -0.06-1.06% 2024-12-31 15:00:01 昨收盘:5.65今开盘:5.65最高价:5.70最低价:5.59 市值:270.28亿元 流通:270.28成交:347797手 换手:0.72% 公司资料意见反馈 公司资料: 公司简介 股本结构 主要股东 流通股股东 基金持股 公司高管 公司章程 ... +**上传时间**:2024-12-31 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行在您身旁齐鲁银行信用卡多渠道进件及信用卡审批系统人力...](https://www.qlbchina.com/qlbchina/2025-01/06/article_2025010613411012428.html) +描述:齐鲁银行 信用卡多渠道进件及信用卡审批系统人力外包项目(二次)竞争性磋商公告 齐鲁银行信用卡多渠道进件及信用卡审批系统人力外包项目(二次)竞争性磋商公告.docx +**上传时间**:2024-12-27 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行股份有限公司德州分行 - 天眼查](https://www.tianyancha.com/company/2354701752) +描述:简介: 齐鲁银行 股份有限公司德州分行,成立于2014年,位于山东省德州市,是一家以从事货币金融服务为主的企业。通过天眼查大数据分析,齐鲁银行股份有限公司德州分行参与招投标项目9次;此外企业还拥有行政许可8个。风险方面共发现企业有 展开 财产线索 线索47预估价值1亿元 ... +**上传时间**:2024-12-09 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- +#### 文章:[齐鲁银行(601665.SH)公司高管-PC_HSF10资料](http://f10.eastmoney.com/f10_v2/CompanyManagement.aspx?code=sh601665) +描述:信贷处办事员、办公室科长,中国农业银行泰安市分行党委委员、副行长,中国农业银行山东省分行办公室副主任、主任,中国农业银行山东省分行党委委员、行长助理,中国农业银行山东省分行党委委员、行长助理兼东营市分行党委书记、行长,中国农业银行山东省分行党委委员、副行长,中国农业银行天津市分行党委书记、行长, 齐鲁银行 党委... +**上传时间**:2024-12-06 +**来源**:百度快照 +**关键词**:齐鲁银行 + +---------------------------------------- diff --git a/resources/log/core.log b/resources/log/core.log index e69de29..8b13789 100644 --- a/resources/log/core.log +++ b/resources/log/core.log @@ -0,0 +1 @@ + diff --git a/spider/__pycache__/github.cpython-312.pyc b/spider/__pycache__/github.cpython-312.pyc index 0c274585717f8909c257ac5d51056deed624a7ce..57e49d9712b67163fb87af77e3c0f9a1cb8db621 100644 GIT binary patch delta 1642 zcmah}U1(cX96zVYy}3#5O>Ua>GqcUIG8@)TT8JO8WbM`!+s;j$&Tg7r(%#!PO?q2T zZgs6Uqz{9xqYssXU$8NRJtzfHf{6GazKAf`i=gu|e9#v~1@QxP2zvhKHZ8=j^YF|6 z{6BvGkMqy{VeIR1<))$p5Ic9;E3;$oD4$EQ`#lJ)tFNG0RB5u&2-?EAOSBq)t7;3n z(7{}{F$5q~5v{N*A*748xZUL5Mo{igX7$zmZ68GG9IvPw>KZm*vm*7#%jyi)rAv~H zN6_|;vYvt9&4ZS*qZH7FIv=G2ktp`lclYg|5G#@{R{XYP#p+~5Rsxx@{kuUdrcTCf z8RlIlWH#FmrMawCf8DeGyUIK(lI_<6T>5*a@PA~&o0$z&M?9KMxD)BYRhH|D+AtOw zDyRf(q3TX|XIctZ2OFG2Tq`$d!=TkwZc`d;5v~o{0Xt~RtYO6t*&e<5s;?5Zo95eC zVExeRK?#TH8^KKXx))gx$B*5lN9#|E(F#h%s8?Q>9id fM =%kG{AD`X`MXzR z61^q0){607k2u9{t67?r+vwX}gI?&Fz9|oNkq4MW9t3!ZL79EUP$IELT0nDzj&%37 zd;($yAPe9FI7;pAL-a Pjns(d#|&* z6z>0o&mag(OL72d`dO%p4knrflU?7hjU{fe7o+XT%{FM1JqfHYf#)PJDq_+Pl4)Uy ziQmy*lbH^fnj>l2LcU;XS_*&5K0+w9kL}aMfigZrKRnQvX`}!iV1PI?zhG#ZJ5b=_ zgS;ypSy Z41PQ4$jZmdS1qcbL%Muo?ynqdlu2>+trOcuWx e! z!RY30!w+9X so2?en$xx Ogz3(vhu^Gf9M!B+8BVS?cDEuJKcR5+MoKr1Fwp_CuMYRlbox@|-yDUdRtucHJ zmN#20%1qU(MMG!DQ4zAMnq00Zs^x0x@JVNMXcj9xu5}C_k7BrQl~rC>mkn7fuT ib7Wd6OeK&B#}mMwTn3(UhA#>d^93S2078 zvxdDC`zp&-)XD{UuRUaa03L{a@MFO5ox@ru5Kn**jLp7`<~T=6WT(+SgTT$OR}u3F z1%dl!uOSu?cE35a@`$sb1&5s)H}j1TgI6hCY>fAQPw5hPLz{c;;3M40_g7siYI?re zE)JP$Z_ZM{YwSR*4!?+DxEBhX`B%djkU^Y8BoU9>_xw-$_RV$;dvj;uWO&$F +I=g|2-14$&-SrC;pl;T5OU!ysNIDl}& z$MHRG_8hmjxBhF~pZ?Dwp2wUQ5icNKLYzW88jQWdzp*iAg^QmxzD(Tan}v6ho4wc> zUmbpT4A;(Z7FL1r!3pO%ZvF&?!E|VmTb3ZprL0z# 9_I=Tqs{ zD|iIM8!)e&Evd5H+`u9_aPo)3v!#ksEvi$@gpYQfca6~_^tTlV?;44ma8sImKsX-i SULN~t&-9ez)|B&*+x<7x+A=Eu diff --git a/spider/__pycache__/sougou_wx.cpython-312.pyc b/spider/__pycache__/sougou_wx.cpython-312.pyc index 0f07af26524f6c41835e0ba52ca3837e2f77228f..fc6f715195264049cdd84ba82ca8bde477fa93f2 100644 GIT binary patch delta 728 zcmYjOO=uHA7@gT9n{ 1ZnKD{A+3tqRInCNPZlZl=Mscvli9|^Y}U>u)_`IU z@!-!v8IOgQ;w@ezhu*vjdZ{2Hp*?vPDuM@3&TJd?JG_1S=FPWnzi)SHYcl&$mIXj7 z)?2L 2(l9o6mcVJOMN(U5Wq*7E+oTBQ)k#=VS04!Gk zN^HWS#sNolv@YCl1VJ0NVJ`W0X9N<1OF^DYbA{eESKxSwTChx{$c@MdrOBQXnG7q3 zDg7Ort=n!1*>hx1Fre9cH)KGKW(7X$o59S4GvvEq!gE9wQ_v*G#Am$0 XnMjP*WAPa(Z`OIe>dPKEzE_ciRe-aW_iTNry4yXpA& zCbe88N7O7$Gp}YK5yCoo6gQwsc0;3NjTaISJnx1
ByqO#PM9 z1NQG(8zX16yg5El&=8W4{6^`9_LXVyP<$ZP%w9)ZhlNEd=3qbOG|C>THYzy{vjO4= z1NQmg5Aj9DSnhtf+F0_)^<*VJ%(M#(E-|=FJ|xelg8?bN*Q$6eYvsP>+SLYLrss>q z)L$jUC*q6x#>_@;J8y66rO&Z4d9VK(SE!*MD*0zH3yEz87K1W_S+X;npAS|XWMeZA z!UT=`G0SS$u47ra$_}O&m<$fq?|P_JcjoX4)!54YWpXWbQW5m83FC(x6}i>uUV?0< IX2R^&A3@liHUIzs delta 920 zcmZ8eOH9;I6zwYm)|qMPe3?NQh5&;T7zH8V0#t$@h%rI=7-!SezDFrAExfiU<3bW~ zfuDsw7cNL3u1wrWX5mf~m$I3Ki%An##)YmlCdL>|?5mEVH_hoi=iT1kckZ+9H@VJT zp7%3uoPS(Ye}ow3EhW|9e q0L}EfC4Z)%fiSy$OmC`oyqes z=Ar=OF4iLvHY^-u7|g2-OgsYl5X+!?6nX-FH3iHnSOwYSPB{rqfL@1#hGG|xZaJJT zuV|`lA;+g1Ix4cqdy94Ua8Gwyb2tmj+9JYc(Z7JuQchmfR!}y0pB=bfOinMUScWJ! zuA4})G0I&)%QiACvnWg=C3hNG%3SWGE9~Xx42bsfNzM9)Pn!rk+yvGPtXZoLJB1wo zsG;jfu{1+}Pt8+mTqsSjO>CX(AcMmM`=)ABZkjCZ`nr9fnSABCKxe6ae+q=+u5MBj z(zuyccaTYcYhr||)bBN>JSvqHwLGe>F*rl+_;1&>(Yk)p99XRzr0G$z8MqmzUpxLK z8JkF&vkHrN$`#s5gTY$>WXMZ?E7<1>)nAc<&`_Iqg-Nm@bb?H2M@R$UBg-`r@~%Dw zcyhR=iKN9)fZoB2`7jw0JBDL_I34}Ju%cCAK2gp7zUL2k{ddHK>%l9r1+ M)|9P*5q1 zkg-_TS#OozRMb~*46m`nNs=xXkR;&(Jvc%kLxIlnjT 标签 + tables = soup.find_all('table', class_='result') + + results = [] + + for table in tables: + # 提取标题和链接 + h3_tag = table.find('h3', class_='t') + if h3_tag: + a_tag = h3_tag.find('a') + title = a_tag.get_text(strip=True) if a_tag else "No title found" + link = a_tag['href'] if a_tag else "No link found" + else: + title = "No title found" + link = "No link found" + + # 提取摘要 + td_element = table.find('td', class_='f') + + # 从td中进一步查找div.realtime之后的所有文本 + realtime_div = td_element.find('div', class_='realtime') + if realtime_div: + text_parts = [] + for sibling in realtime_div.next_siblings: + if sibling.name == 'font': + break + if isinstance(sibling, str) and sibling.strip(): + text_parts.append(sibling.strip()) + elif sibling.name and sibling.get_text(strip=True): + text_parts.append(sibling.get_text(strip=True)) + + # 将所有文本片段合并成一个字符串,并整理格式 + cleaned_text = ' '.join(text_parts) + + # 提取发布者 + publisher_tag = table.find('a', class_='m') + publisher = publisher_tag.get_text(strip=True) if publisher_tag else "百度快照" + + # 提取时间戳 + time_tag = table.find('div', class_='realtime') + pub_date = time_tag.get_text(strip=True) if time_tag else "No timestamp found" + pub_date = normalize_pub_date(pub_date) + + results.append({ + "title": title, + "link": link, + "description": cleaned_text, + "author": publisher, + "pubDate": pub_date + }) + + return results + +def baidu_main(keywords): + all_results = {} # 用于存储所有关键词的结果 + + for keyword in keywords: + url = f"https://www.baidu.com/s?tn=baidurt&cl=3&rn=20&ie=utf-8&rsv_bp=1&wd={keyword}" + # print(url) + html_content = fetch_html(url) + # 将解析后的数据保存到 JSON 文件 + with open('./test.html', 'w', encoding='utf-8') as f: + f.write(html_content) + # print(html_content) + + if html_content is None: + logger.warning(f"无法获取百度搜索内容,跳过保存操作。关键词: {keyword}") + continue + + results = parse_html(html_content) + # 移除非法代理对 + logger.info(f"关键词【{keyword}】的百度搜索内容保存成功。") + all_results[keyword] = results # 将结果存储在字典中,以关键词为键 + time.sleep(5) + + # 将所有结果转换为JSON格式 + json_results = json.dumps(all_results, ensure_ascii=False, indent=4) + # print(json_results) + + # 确保目录存在 + os.makedirs(os.path.dirname('./resources/JSON/baidu.json'), exist_ok=True) + + # 将解析后的数据保存到 JSON 文件 + with open('./resources/JSON/baidu.json', 'w', encoding='utf-8') as f: + f.write(json_results) + +if __name__ == "__main__": + keywords = ["齐鲁银行"] + baidu_main(keywords) \ No newline at end of file diff --git a/spider/sougou_wx.py b/spider/sougou_wx.py index e125381..8c1c7f9 100644 --- a/spider/sougou_wx.py +++ b/spider/sougou_wx.py @@ -12,12 +12,6 @@ headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", "Accept-Encoding": "gzip, deflate, br", - "Upgrade-Insecure-Requests": "1", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - "Priority": "u=0, i", "Te": "trailers", "Connection": "keep-alive" } diff --git a/baidu开发文档.md b/开发文档/baidu开发文档.md similarity index 83% rename from baidu开发文档.md rename to 开发文档/baidu开发文档.md index 0721f06..febf165 100644 --- a/baidu开发文档.md +++ b/开发文档/baidu开发文档.md @@ -16,4 +16,7 @@ www.baidu.com/s?wd={关键词}&cl=3&pn=1&ie=utf-8&rn=20&tn=baidurt - python-bs4网页解析 - python-sqlite联动 - python-request爬虫 -- sqlite筛选 \ No newline at end of file +- sqlite筛选 + +### 问题阐述 +- 百度抓取的文章若是时间是近日,则不会显示具体时间,而是显示【*天前】,需要处理 \ No newline at end of file diff --git a/github开发文档.md b/开发文档/github开发文档.md similarity index 100% rename from github开发文档.md rename to 开发文档/github开发文档.md