# Poc_Scanner/new_poc_tools.py
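#
# Batch PoC scanning helper: reads target URLs from a text file, runs the
# checks from base_tool.validate_main against each target, looks up company
# names via whois.west.cn's ICP page, and writes the findings into Word
# documents (one summary report plus optional per-vulnerability reports).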

import os
import time
import logging
import tldextract
import base64
import urllib.parse
import sys
import docx
import re
import warnings
import requests
import argparse
from docx.shared import Cm  # unit conversion helper (centimeters)
from bs4 import BeautifulSoup
from docx import Document
from docx.oxml.ns import qn
from base_tool import validate_main, check_url_status
from screenshot import screenshot
from colorama import init, Fore
# Initialize colorama
init()
os.system("")  # enables ANSI escape sequence handling in the Windows console
warnings.filterwarnings("ignore")
# Configure logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%H:%M')
DEFAULT_HEADERS = {
    'Accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
    'Referer': 'https://www.baidu.com',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
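# The headers above imitate a desktop browser (with a Baidu Referer), which
# makes it less likely that the whois/ICP lookup pages reject the requests.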
def get_company_name(url):
    # Fetch the ICP lookup page
    response = requests.get(url, headers=DEFAULT_HEADERS, verify=False)
    # Check whether the request succeeded
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None
    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    # Locate the element that carries the company name
    company_name_tag = soup.find('div', {'tag': 'company_name'})
    # Extract the company name
    if company_name_tag:
        text = company_name_tag.text
        cleaned_text = re.sub(r'[^\w\s]', '', text)  # strip punctuation (keeps word characters and spaces)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # collapse extra whitespace and newlines
        return cleaned_text
    else:
        print("Company name not found")
        return None
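# Example (hypothetical domain), mirroring how the function is called below:
#   get_company_name("https://whois.west.cn/icp/example.com")
# The <div tag="company_name"> selector reflects whois.west.cn's current
# markup and will stop matching if the page layout changes.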
def get_website_title(url):
    try:
        # Fetch the target page
        response = requests.get(url, headers=DEFAULT_HEADERS, verify=False)
        response.raise_for_status()  # check whether the request succeeded
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page: {e}")
        return None
    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the <title> tag
    title_tag = soup.find('title')
    # Extract the title text
    if title_tag:
        return title_tag.text
    else:
        print("Website title not found")
        return None
def extract_domains_from_file(file_path):
    domains = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line:  # skip blank lines so they are not scanned as empty targets
                    domains.append(line)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
    except Exception as e:
        logging.error(f"Error while reading the file: {e}")
    return domains
def create_document():
    document = Document()
    document.styles['Normal'].font.name = 'Times New Roman'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')  # 宋体 = SimSun
    return document
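# Note: python-docx's style.font.name only applies to Latin text; the
# w:eastAsia rFonts attribute above is set on the XML element directly so
# that Chinese text renders in 宋体 (SimSun).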
def extract_root_domain(url):
    extracted = tldextract.extract(url)
    root_domain = f"{extracted.domain}.{extracted.suffix}"
    # encoded_bytes = base64.b64encode(root_domain.encode('utf-8'))
    # encoded_str = encoded_bytes.decode('utf-8')
    return urllib.parse.quote(root_domain)
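# Example: extract_root_domain("http://vr.sh-fit.com:9090") returns
# "sh-fit.com" (tldextract drops the "vr" subdomain and the port).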
def add_scan_results_to_document(document, domain, results, include_all, description, choice_3):
    for name, result, status_code, url, res_time in results:
        # "存在漏洞" ("vulnerable") is the sentinel value produced by
        # base_tool.validate_main, so the literal is compared as-is.
        if include_all or result == "存在漏洞":
            company_name = get_company_name("https://whois.west.cn/icp/" + extract_root_domain(domain))
            document.add_heading(f"Target: {domain}", level=3)
            document.add_paragraph(f"Vulnerability name: {name}")
            document.add_paragraph(f"Company name: {company_name}")
            document.add_paragraph(f"Vulnerability URL: {url}")
            document.add_paragraph(f"Response status: {status_code}")
            document.add_paragraph(f"Response time: {res_time}")
            document.add_paragraph(f"Result: {result}")
            document.add_paragraph("\n")

        if result == "存在漏洞" and choice_3 == "y":
            screenshot_path_1 = screenshot(url)
            # chinaz.com now blocks scraping, so this screenshot source is deprecated:
            # screenshot_path_2 = screenshot("https://icp.chinaz.com/home/info?host=" + extract_root_domain(domain))

            # Word handling: load the per-vulnerability report template
            doc = docx.Document("./file/模板.docx")  # 模板.docx = "template.docx"
            # Fixed template slots; the paragraph indices below assume the
            # layout of the bundled template: site title, domain,
            # vulnerability name, test case URL, description
            title = get_website_title(domain)
            doc.paragraphs[2].text = title
            doc.paragraphs[4].text = domain
            doc.paragraphs[6].text = name
            doc.paragraphs[8].text = url
            doc.paragraphs[10].text = description
            # The evidence screenshot is appended to the description paragraph
            paragraph = doc.paragraphs[10]
            # Adjust the picture width/height (in cm) as needed
            run = paragraph.add_run()
            if screenshot_path_1:
                run.add_picture(screenshot_path_1, width=Cm(16.52), height=Cm(9.13))
            # run.add_picture(screenshot_path_2, width=Cm(16.52), height=Cm(9.13))  # ICP filing screenshot, deprecated pending a new source

            doc_save_path = './file/result/'
            if not os.path.exists(doc_save_path):
                os.mkdir(doc_save_path)
            # Save the per-vulnerability report; adjust the naming as needed
            doc_name = str(company_name) + "_" + name + ".docx"
            doc.save(doc_save_path + doc_name)
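# Suggestion (not in the original flow): vulnerability names containing
# characters that are illegal in filenames would make doc.save fail; a
# sanitizing pass such as re.sub(r'[\\/:*?"<>|]', '_', doc_name) before
# saving would guard against that.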
def mass_poc_scan(domains, include_all, choice_3, docx_name, status):
    """Scan each domain and collect the results into one summary document.

    choice_3 -- 'y' to generate per-vulnerability report documents
    status   -- 'y' to check each target's reachability before scanning
    """
    document = create_document()
    current_domain = None  # tracks the domain currently being scanned
    try:
        for domain in domains:
            logging.info(f"Scanning domain: {domain}")
            current_domain = domain
            if status == 'y' and not check_url_status(domain):
                logging.warning(f"Unreachable, skipping this domain: {domain}")
                print("--------------------------------------------------")
                if choice_3.lower() == 'y':
                    document.add_heading(f"Target: {domain} is unreachable!", level=3)
                continue
            try:
                results, description = validate_main(domain)
                add_scan_results_to_document(document, domain, results, include_all, description, choice_3)
            except Exception as e:
                logging.error(f"Error while scanning domain {domain}: {e}")
                print("--------------------------------------------------")
    except KeyboardInterrupt:
        print(Fore.RED + '\nCtrl+C detected, stopping the program')
        if current_domain is not None:
            print(f"Domain being scanned at interruption: {current_domain}")
        else:
            print("Scanning had not started yet.")
        print(Fore.RESET)
    save_document(document, docx_name)
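# Note: mass_poc_scan calls save_document even after a KeyboardInterrupt, so
# a partial summary report is still written when a scan is interrupted.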
def save_document(document, docx_name):
    timestamp = str(int(time.time()))
    # If docx_name is empty, exit without writing a report
    if not docx_name:
        logging.info("Program finished!")
        sys.exit()
    results_dir = "./file/report/"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    document.save(os.path.join(results_dir, f"{docx_name}_{timestamp}.docx"))
    logging.info("Scan report generated!")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Most options are entered interactively after the program starts; use the flags below to enable the default batch mode.")
    parser.add_argument('--batch', action='store_true', help='Enable batch mode: default urls.txt targets / drop non-vulnerable links / generate reports')
    parser.add_argument('-n', '--name', type=str, help='Summary report file name (batch mode)')
    args = parser.parse_args()
    if args.batch:
        # Batch mode
        print("Default mode (default target file / keep only vulnerable links / generate reports)!")
        file_path = "./urls.txt"
        include_all = False
        choice_3 = 'y'
        status = 'y'
    else:
        # Interactive mode
        choice = input(Fore.BLUE + "Use a different target file? (y/n): " + Fore.RESET).lower()
        if choice == 'n':
            print("Using the default target file urls.txt")
            file_path = "./urls.txt"
        else:
            file_path = input(Fore.BLUE + "Enter the target file to scan: " + Fore.RESET)
        print("--------------------------------------------------")
        status = input(Fore.BLUE + "Check target reachability before scanning? (y/n): " + Fore.RESET).lower()
        print("--------------------------------------------------")
        choice_2 = input(Fore.BLUE + "Drop records of sites without vulnerabilities? (y/n): " + Fore.RESET).lower()
        include_all = choice_2 != 'y'
        print("--------------------------------------------------")
        choice_3 = input(Fore.BLUE + "Generate per-vulnerability reports? (y/n): " + Fore.RESET).lower()
        print("--------------------------------------------------")
    # Run the scan
    domains = extract_domains_from_file(file_path)
    if args.batch:
        mass_poc_scan(domains, include_all, choice_3, args.name, status)
    else:
        docx_name = input(Fore.BLUE + "Enter the summary report file name (press Enter to skip report generation): " + Fore.RESET)
        print("--------------------------------------------------")
        mass_poc_scan(domains, include_all, choice_3, docx_name, status)
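# Usage (assuming the script is run from the Poc_Scanner directory):
#   python new_poc_tools.py --batch -n report   # batch mode; summary saved as ./file/report/report_<timestamp>.docx
#   python new_poc_tools.py                     # interactive mode with prompts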
# if __name__ == "__main__":
#     domain = 'http://vr.sh-fit.com:9090'
#     company_name = get_company_name("https://whois.west.cn/icp/" + extract_root_domain(domain))
#     print(company_name)