Merge branch 'main' into feature/pickle-data

2024-06-03 20:31:12 +08:00
parent b518fef6d2 4f4860342c
commit f113449fc4
17 changed files with 938 additions and 42 deletions
--- a/detection/Regexdetection.py
+++ b/detection/Regexdetection.py
@@ -25,15 +25,25 @@ def find_dangerous_functions(
        ".cpp": {
            r"\bsystem\(": "high",
        },
+        ".pyc": {
+            r"\bexec\b": "high",
+            r"\beval\b": "high",
+            r"\bos\.system\b": "high",
+            r"\bos\.exec\b": "high",
+            r"\bos\.fork\b": "high",
+            r"\bos\.kill\b": "high",
+            r"\bos\.popen\b": "medium",
+            r"\bos\.spawn\b": "medium",
+        },
    }
    risk_patterns = patterns.get(file_extension, {})
    classified_results = {"high": [], "medium": [], "low": [], "none": []}
-    for line_number, line in enumerate(file_content.split("\n"), start=1):
-        clean_line = remove_comments(line, file_extension)
-        if not clean_line:
-            continue
-        for pattern, risk_level in risk_patterns.items():
-            if re.search(pattern, clean_line):
-                classified_results[risk_level].append((line_number, clean_line))
+    if file_content is not None:
+        for line_number, line in enumerate(file_content.split("\n"), start=1):
+            clean_line = remove_comments(line, file_extension)
+            if not clean_line:
+                continue
+            for pattern, risk_level in risk_patterns.items():
+                if re.search(pattern, clean_line):
+                    classified_results[risk_level].append((line_number, clean_line))
    return classified_results
-
--- a/detection/main.py
+++ b/detection/main.py
@@ -0,0 +1,452 @@
+import os
+from typing import Dict, List, Tuple, Optional
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.platypus import Paragraph, Spacer, SimpleDocTemplate
+from .Regexdetection import find_dangerous_functions
+from .GPTdetection import detectGPT
+from .pyc_detection import disassemble_pyc
+from .utils import *
+import sys
+from colorama import init, Fore, Style
+from tqdm import tqdm
+from pathlib import Path
+
+# Status flags flipped by checkModeAndDetect and reported by main() after the scan.
+# PYCDC_FLAG becomes False when .pyc decompilation returned the sentinel "none".
+PYCDC_FLAG = True
+# PYCDC_ADDR_FLAG becomes False when .pyc decompilation returned the sentinel "invalid".
+PYCDC_ADDR_FLAG = True
+# File suffixes that process_path hands to a detector; everything else is skipped.
+SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp", ".pyc"}
+# Report format names. NOTE(review): output_results defines its own local set with
+# the same values, so this list is not read by the code visible in this module.
+OUTPUT_FORMATS = ["html", "md", "txt", "pdf"]
+# Substrings that highlight_orders wraps in colour codes for terminal output.
+ORDERS = [
+    "__import__",
+    "system",
+    "exec",
+    "popen",
+    "eval",
+    "subprocess",
+    "__getattribute__",
+    "getattr",
+    "child_process",
+]
+
+# Initialize colorama
+init(autoreset=True)
+
+# Raw ANSI 256-colour foreground escape sequence (colour index 214).
+ORANGE = "\033[38;5;214m"
+CYAN = Fore.CYAN
+
+
+def supports_color() -> bool:
+    """
+    Reports whether colored terminal output should be used.
+
+    Returns:
+        bool: True when running on Windows (sys.platform == "win32") or when
+        sys.stdout reports itself as a TTY; False otherwise.
+    """
+    # The TTY probe is only evaluated when the platform check fails.
+    return sys.platform == "win32" or bool(
+        hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
+    )
+
+
+def supports_emoji() -> bool:
+    """
+    Reports whether emoji should be used in terminal output.
+
+    Returns:
+        bool: True on every platform except Windows; on Windows, True only
+        when the WT_SESSION environment variable is set.
+    """
+    if sys.platform != "win32":
+        return True
+    # On win32 the decision rests solely on WT_SESSION being present.
+    return os.getenv("WT_SESSION") is not None
+
+
+def highlight_orders(line: str, risk_level: str, use_color: bool) -> str:
+    """
+    Highlights specific orders in the line based on risk level.
+
+    Every occurrence of an entry of ORDERS is wrapped in the color chosen for
+    ``risk_level`` followed by a reset code. All entries are matched in one
+    pass, longest first, so an entry that contains another one
+    (``__getattribute__`` contains ``getattr``) is wrapped exactly once
+    instead of receiving nested color codes.
+
+    Args:
+        line (str): The line to highlight.
+        risk_level (str): The risk level of the line ("high", "medium", "low").
+            Any other value falls back to white.
+        use_color (bool): Whether to use color for highlighting.
+
+    Returns:
+        str: The highlighted line; ``line`` unchanged when use_color is False.
+    """
+    if not use_color:
+        # Without color both the prefix and the suffix are empty strings,
+        # so there is nothing to insert.
+        return line
+
+    import re  # local import: this module does not import re at file level
+
+    risk_colors = {
+        "high": Fore.RED,
+        "medium": Fore.YELLOW,
+        "low": CYAN,
+    }
+    color = risk_colors.get(risk_level, Fore.WHITE)
+    reset = Style.RESET_ALL
+
+    # Longest-first alternation so overlapping keywords cannot match inside each other.
+    keywords = sorted(ORDERS, key=len, reverse=True)
+    if not keywords:
+        return line
+    pattern = re.compile("|".join(re.escape(word) for word in keywords))
+    return pattern.sub(lambda match: f"{color}{match.group(0)}{reset}", line)
+
+
+def generate_text_content(results: Dict[str, List[Tuple[int, str]]]) -> str:
+    """
+    Builds the terminal-oriented text report for the analysis results.
+
+    Color codes are emitted only when supports_color() is true and emoji only
+    when supports_emoji() is true. Risk levels with no entries, and the
+    "none" level, are left out of the report.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): Findings grouped by risk level.
+
+    Returns:
+        str: The formatted report.
+    """
+    use_color = supports_color()
+    use_emoji = supports_emoji()
+
+    reset = Style.RESET_ALL if use_color else ""
+    number_color = Fore.GREEN if use_color else ""
+    heading_colors = {
+        "high": Fore.RED,
+        "medium": Fore.YELLOW,
+        "low": Fore.GREEN,
+    }
+    if use_emoji:
+        heading_emoji = {"High": "👹", "Medium": "👾", "Low": "👻"}
+    else:
+        heading_emoji = {"High": "", "Medium": "", "Low": ""}
+
+    pieces = ["Security Analysis Report\n", "=" * 30 + "\n\n"]
+
+    for risk_level, entries in results.items():
+        if not entries or risk_level == "none":
+            continue
+
+        heading_color = heading_colors.get(risk_level, Fore.WHITE) if use_color else ""
+        label = risk_level.capitalize()
+        pieces.append(f"{heading_color}{label} Risk{heading_emoji[label]}:{reset}\n")
+        # Underline as wide as "<level> Risk:" without the emoji.
+        pieces.append("-" * (len(risk_level) + 6) + "\n")
+
+        for line_num, line in entries:
+            shown = highlight_orders(line, risk_level, use_color)
+            pieces.append(f"{reset} {number_color}{line_num}{reset}: {shown}{reset}\n")
+        pieces.append("\n")
+
+    return "".join(pieces)
+
+
+def output_results(
+    results: Dict[str, List[Tuple[int, str]]],
+    output_format: str,
+    output_file: Optional[str] = None,
+) -> None:
+    """
+    Outputs the security analysis results in the specified format.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
+        output_format (str): The format to output the results in. Supported formats: "pdf", "html", "md", "txt".
+            Any other value falls back to "txt" and the file extension is replaced with ".txt".
+        output_file (Optional[str]): The name of the file to save the output. If None, prints to the terminal.
+    """
+    if not output_file:
+        # If no output file is specified, default to text output to the terminal.
+        print(generate_text_content(results))
+        return
+
+    supported_formats = {"pdf", "html", "md", "txt"}
+    if output_format not in supported_formats:
+        output_format = "txt"
+        output_file = f"{os.path.splitext(output_file)[0]}.txt"
+
+    results_dir = os.path.dirname(output_file)
+    if results_dir:
+        # exist_ok avoids the check-then-create race when the directory
+        # appears between the existence test and the mkdir.
+        os.makedirs(results_dir, exist_ok=True)
+
+    writers = {
+        "pdf": output_pdf,
+        "html": output_html,
+        "md": output_markdown,
+        "txt": output_text,
+    }
+    writers[output_format](results, output_file)
+
+
+def output_pdf(results: Dict[str, List[Tuple[int, str]]], file_name):
+    """
+    Writes the security analysis results to a PDF file.
+
+    Every risk level except "none" gets a heading followed by one paragraph
+    per finding.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
+        file_name: Path of the PDF file to create.
+    """
+    # Paragraph parses its text as XML-like markup, so scanned source lines are
+    # escaped; otherwise "<" and "&" in code would be interpreted as markup.
+    from xml.sax.saxutils import escape
+
+    doc = SimpleDocTemplate(file_name, pagesize=letter)
+    story = []
+    styles = getSampleStyleSheet()
+
+    # Add the title centered
+    title_style = styles["Title"]
+    title_style.alignment = 1  # Center alignment
+    title = Paragraph("Security Analysis Report", title_style)
+    story.append(title)
+    story.append(Spacer(1, 20))  # Space after title
+
+    # Add risk levels and entries
+    normal_style = styles["BodyText"]
+    for risk_level, entries in results.items():
+        if risk_level != "none":
+            story.append(
+                Paragraph(f"{risk_level.capitalize()} Risk:", styles["Heading2"])
+            )
+            for line_num, line in entries:
+                entry = Paragraph(
+                    f"Line {escape(str(line_num))}: {escape(str(line))}", normal_style
+                )
+                story.append(entry)
+            story.append(Spacer(1, 12))  # Space between sections
+
+    doc.build(story)
+
+
+def output_html(results: Dict[str, List[Tuple[int, str]]], file_name=None):
+    """
+    Generates an HTML report for security analysis results.
+
+    Line labels and source lines are HTML-escaped before insertion, because
+    they come from the files being scanned and must not be interpreted as
+    markup by the browser.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
+        file_name (Optional[str]): The name of the file to save the HTML output. If None, returns the HTML string.
+
+    Returns:
+        Optional[str]: The HTML string if file_name is None, otherwise None.
+    """
+    from html import escape  # local import: html is not imported at file level
+
+    html_output = """
+    <html>
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <link rel="icon" href="https://s2.loli.net/2024/05/30/WDc6MekjbuCU9Qo.png">
+        <title>Security Analysis Report</title>
+        <style>
+            body {
+                background-image: url('https://s2.loli.net/2024/05/30/85Mv7leB2IRWNp6.jpg');
+                background-size: 100%, auto;
+                background-attachment: fixed;
+                font-family: Arial, sans-serif;
+            }
+            h1, h2 {
+                color: white;
+            }
+            ul {
+                list-style-type: none;
+                padding: 0;
+            }
+            li {
+                background: rgba(255, 255, 255, 0.8);
+                margin: 5px 0;
+                padding: 10px;
+                border-radius: 5px;
+            }
+        </style>
+    </head>
+    <body>
+        <h1>Security Analysis Report</h1>
+    """
+
+    risk_emoji = {"High": "👹", "Medium": "👾", "Low": "👻"}
+    sections = []
+    for risk_level, entries in results.items():
+        if risk_level != "none":
+            label = risk_level.capitalize()
+            sections.append(f"<h2>{label} Risk{risk_emoji[label]}</h2><ul>")
+            for line_num, line in entries:
+                sections.append(
+                    f"<li>{escape(str(line_num))}: {escape(str(line))}</li>"
+                )
+            sections.append("</ul>")
+
+    html_output += "".join(sections)
+    html_output += "</body></html>"
+
+    if file_name:
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write(html_output)
+        return None
+    else:
+        return html_output
+
+
+def output_markdown(results: Dict[str, List[Tuple[int, str]]], file_name=None):
+    """
+    Generates a Markdown report for security analysis results.
+
+    Each risk level except "none" becomes a section with a two-column table.
+    Pipe characters inside a cell are backslash-escaped so that a source line
+    containing "|" does not split the table row.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
+        file_name (Optional[str]): The name of the file to save the Markdown output. If None, returns the Markdown string.
+
+    Returns:
+        Optional[str]: The Markdown string if file_name is None, otherwise None.
+    """
+
+    def _cell(value) -> str:
+        # Escape the column separator so it is rendered literally.
+        return str(value).replace("|", "\\|")
+
+    parts = ["# Security Analysis Report\n\n"]
+
+    for risk_level, entries in results.items():
+        if risk_level != "none":
+            parts.append(f"## {risk_level.capitalize()} Risk\n\n")
+            parts.append("| Line Number | Description |\n")
+            parts.append("|-------------|-------------|\n")
+            for line_num, line in entries:
+                parts.append(f"| {_cell(line_num)} | {_cell(line)} |\n")
+            parts.append("\n")
+
+    md_output = "".join(parts)
+
+    if file_name:
+        # Explicit UTF-8, matching the HTML writer; the locale default can
+        # fail on non-ASCII source lines.
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write(md_output)
+        return None
+    else:
+        return md_output
+
+
+def output_text(results: Dict[str, List[Tuple[int, str]]], file_name=None):
+    """
+    Generates a plain text report for security analysis results.
+
+    Each risk level except "none" becomes an underlined heading followed by
+    one indented "Line N: ..." row per finding.
+
+    Args:
+        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
+        file_name (Optional[str]): The name of the file to save the text output. If None, returns the text string.
+
+    Returns:
+        Optional[str]: The text string if file_name is None, otherwise None.
+    """
+    title = "Security Analysis Report"
+    parts = [f"{title}\n", "=" * len(title) + "\n\n"]
+
+    for risk_level, entries in results.items():
+        if risk_level != "none":
+            heading = f"{risk_level.capitalize()} Risk:"
+            parts.append(f"{heading}\n")
+            parts.append("-" * len(heading) + "\n")
+            for line_num, line in entries:
+                parts.append(f"  Line {line_num}: {line}\n")
+            parts.append("\n")
+
+    text_output = "".join(parts)
+
+    if file_name:
+        # Explicit UTF-8, matching the HTML writer; the locale default can
+        # fail on non-ASCII source lines.
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write(text_output)
+        return None
+    else:
+        return text_output
+
+
+def checkModeAndDetect(mode: str, filePath: str, fileExtension: str, pycdc_addr: str):
+    """
+    Loads one file and runs the detector selected by ``mode`` on its content.
+
+    ``.pyc`` files are decompiled first. When decompilation yields the sentinel
+    "none" or "invalid", the matching module flag (PYCDC_FLAG or
+    PYCDC_ADDR_FLAG) is cleared and an empty string is returned, so the
+    sentinel text itself is never scanned or sent to the LLM.
+
+    Args:
+        mode (str): "llm" selects detectGPT; "regex" or any other value selects
+            find_dangerous_functions.
+        filePath (str): Path of the file to analyse.
+        fileExtension (str): Suffix of the file, including the leading dot.
+        pycdc_addr (str): Path to the pycdc executable, passed to disassemble_pyc.
+
+    Returns:
+        The selected detector's result, or "" when a .pyc file could not be decompiled.
+    """
+    # TODO: add more detection modes; improve code reuse and extensibility here
+    global PYCDC_FLAG, PYCDC_ADDR_FLAG
+
+    if fileExtension == ".pyc":
+        # Disassemble the pyc file
+        file_content = disassemble_pyc(filePath, pycdc_addr)
+        if file_content == "none":
+            PYCDC_FLAG = False
+            return ""
+        if file_content == "invalid":
+            PYCDC_ADDR_FLAG = False
+            return ""
+    else:
+        file_content = read_file_content(filePath)
+
+    if mode == "llm":
+        return detectGPT(file_content)
+    # "regex" and every unrecognised mode use the regex detector.
+    return find_dangerous_functions(file_content, fileExtension)
+
+
+def process_path(
+    path: str, output_format: str, mode: str, pycdc_addr: str, output_file=None
+):
+    """
+    Scans a file or a directory tree and outputs the combined findings.
+
+    For a directory, every file below it whose suffix is in
+    SUPPORTED_EXTENSIONS is scanned with a progress bar. For a single file,
+    an unsupported suffix prints a message and returns without output. A path
+    that is neither a file nor a directory prints a message and exits with
+    status 1.
+
+    Args:
+        path (str): File or directory to scan.
+        output_format (str): Report format passed on to output_results.
+        mode (str): Detection mode passed on to checkModeAndDetect.
+        pycdc_addr (str): Path to the pycdc executable, passed on to checkModeAndDetect.
+        output_file: Report file path passed on to output_results.
+    """
+    results = {"high": [], "medium": [], "low": [], "none": []}
+
+    def _collect(source, findings):
+        # Merge one file's findings, labelling every entry with its source path.
+        if findings is None:
+            return
+        for level in findings:
+            if level == "none":  # Exclude 'none' risk level
+                continue
+            results[level].extend(
+                [
+                    (f"{source}: Line {line_num}", line)
+                    for line_num, line in findings[level]
+                ]
+            )
+
+    if os.path.isdir(path):
+        # Use rglob to collect every file with a supported suffix
+        candidates = [
+            entry
+            for entry in Path(path).rglob("*")
+            if entry.suffix in SUPPORTED_EXTENSIONS
+        ]
+
+        # Progress bar while scanning
+        for candidate in tqdm(candidates, desc="Scanning files", unit="file"):
+            suffix = candidate.suffix
+            _collect(
+                candidate,
+                checkModeAndDetect(mode, str(candidate), suffix, pycdc_addr),
+            )
+    elif os.path.isfile(path):
+        suffix = os.path.splitext(path)[1]
+        if suffix not in SUPPORTED_EXTENSIONS:
+            print("Unsupported file type.")
+            return
+        _collect(path, checkModeAndDetect(mode, path, suffix, pycdc_addr))
+    else:
+        print("Invalid path.")
+        sys.exit(1)
+
+    output_results(results, output_format, output_file)
+
+
+def main():
+    """
+    Command-line entry point: parses arguments, runs the scan and reports pycdc problems.
+
+    The report format is taken from the extension of ``--output``. An
+    unrecognised or missing extension falls back to TXT, and the output path
+    is rewritten to end in ".txt".
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Backdoor detection tool.", prog="detection"
+    )
+    parser.add_argument("path", help="Path to the code to analyze")
+    parser.add_argument("-o", "--output", help="Output file path", default=None)
+    parser.add_argument(
+        "-m", "--mode", help="Mode of operation:[regex,llm]", default="regex"
+    )
+    parser.add_argument(
+        "-p", "--pycdc", help="Path to pycdc.exe to decompile", default=None
+    )
+    args = parser.parse_args()
+    output_format = "txt"  # Default output format
+    output_file = None
+    if args.output:
+        root, ext = os.path.splitext(args.output)
+        ext = ext.lower()
+        if ext in [".html", ".md", ".txt", ".pdf"]:
+            output_format = ext.replace(".", "")
+            output_file = args.output
+        else:
+            print(
+                "Your input file format was incorrect, the output has been saved as a TXT file."
+            )
+            # splitext only strips a real extension; splitting on the last "."
+            # would mangle paths such as "./report" or "out.d/report".
+            output_file = root + ".txt"
+    # If no output file is specified, print to stdout; otherwise write to the file
+    process_path(args.path, output_format, args.mode, args.pycdc, output_file)
+    if not PYCDC_FLAG:
+        print(
+            "ERROR: Detected Python 3.11 or above .pyc files. You need to install pycdc and compile it yourself to obtain pycdc."
+        )
+        print("Repo: https://github.com/zrax/pycdc.git")
+    if not PYCDC_ADDR_FLAG:
+        print("ERROR: The specified pycdc.exe path is not valid")
+        print("Please check your pycdc path.")
+
+
+if __name__ == "__main__":
+    main()
--- a/detection/cngptdetection.py
+++ b/detection/cngptdetection.py
@@ -0,0 +1,113 @@
+import os
+import requests
+import re
+import json
+from typing import List, Dict, Any
+
+
+class TimeoutException(Exception):
+    """Custom exception used to signal a timeout."""
+
+
+def detectGPT(content: str) -> str:
+    """
+    Asks the Baidu ERNIE chat API to flag potential security vulnerabilities in code.
+
+    Args:
+        content: The source code to analyse.
+
+    Returns:
+        A JSON string with the findings grouped under "high", "medium", "low"
+        and "none"; each finding is a [line number, stripped source line] pair.
+
+    Raises:
+        ValueError: If BAIDU_API_KEY or BAIDU_SECRET_KEY is not set, the chat
+            request fails, or the response has no "result" field.
+    """
+    # Credentials are read from the environment only; keys must never be
+    # written into the source, not even in comments.
+    api_key = os.getenv("BAIDU_API_KEY")
+    secret_key = os.getenv("BAIDU_SECRET_KEY")
+    if not api_key or not secret_key:
+        raise ValueError("BAIDU_API_KEY or BAIDU_SECRET_KEY is not set")
+
+    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-4.0-8k-0329?access_token=" + get_access_token(
+        api_key, secret_key)
+
+    payload = json.dumps({
+        "messages": [
+            {
+                "role": "user",
+                "content": (
+                        "You are a Python code reviewer. Read the code below and identify any potential "
+                        "security vulnerabilities. Classify them by risk level (high, medium, low, none). "
+                        'Only report the line number and the risk level.\nYou should output the result as '
+                        'json format in one line. For example: [{"Line": {the line number}, "Risk": "{choose from (high,medium,low)}","Reason":"{how it is vulnerable}"}] '
+                        "Each of these three fields is required.\nYou are required to only output the json format. "
+                        "Do not output any other information." + content
+                )
+            }
+        ]
+    })
+    headers = {
+        'Content-Type': 'application/json'
+    }
+
+    try:
+        # requests waits indefinitely unless a timeout is given.
+        response = requests.post(url, headers=headers, data=payload, timeout=120)
+        response.raise_for_status()
+        res_json = response.json()
+        message_content = res_json.get('result')
+        if message_content is None:
+            raise ValueError("API response content is None")
+    except requests.RequestException as e:
+        raise ValueError(f"Request failed: {str(e)}") from e
+
+    extracted_data = extract_json_from_text(message_content)
+
+    classified_results = {"high": [], "medium": [], "low": [], "none": []}
+    source_lines = content.split("\n")  # split once instead of once per finding
+    for res in extracted_data:
+        try:
+            line_number = int(res["Line"])
+            if line_number < 1:
+                # Zero or a negative number would silently index from the end of the file.
+                continue
+            classified_results[res["Risk"]].append(
+                (line_number, source_lines[line_number - 1].strip())
+            )
+        except (ValueError, IndexError, KeyError, TypeError):
+            # Skip malformed findings: missing fields, unknown risk level,
+            # out-of-range line, or an item that is not a dict.
+            continue
+
+    return json.dumps(classified_results, indent=2, ensure_ascii=False)
+
+
+def get_access_token(api_key: str, secret_key: str) -> str:
+    """
+    Exchanges the API key and secret key for an OAuth access token.
+
+    Args:
+        api_key: Sent as ``client_id``.
+        secret_key: Sent as ``client_secret``.
+
+    Returns:
+        The access_token string.
+
+    Raises:
+        requests.RequestException: If the request fails, times out, or returns an error status.
+        ValueError: If the response carries no "access_token" field.
+    """
+    url = "https://aip.baidubce.com/oauth/2.0/token"
+    params = {"grant_type": "client_credentials", "client_id": api_key, "client_secret": secret_key}
+    # requests waits indefinitely unless a timeout is given.
+    response = requests.post(url, params=params, timeout=30)
+    response.raise_for_status()
+    access_token = response.json().get("access_token")
+    if not access_token:
+        # Fail here with a clear message instead of returning None, which the
+        # caller would concatenate into a URL.
+        raise ValueError("Access token missing from token endpoint response")
+    return access_token
+
+
+def extract_json_from_text(text: str) -> List[Dict[str, Any]]:
+    """
+    Extracts JSON data from text.
+
+    The first substring that looks like a JSON array of objects
+    (``[ {...} ]``) is located and parsed.
+
+    Args:
+        text: Text that contains JSON data.
+
+    Returns:
+        The parsed JSON data; an empty list when no array is found or the
+        array cannot be decoded (a message is printed in both cases).
+    """
+    found = re.search(r'\[\s*{.*?}\s*\]', text, re.DOTALL)
+    if found is None:
+        print("未找到 JSON 数据")
+        return []
+
+    try:
+        return json.loads(found.group(0))
+    except json.JSONDecodeError as e:
+        print(f"解码 JSON 时出错: {e}")
+        return []
--- a/detection/pyc_detection.py
+++ b/detection/pyc_detection.py
@@ -0,0 +1,49 @@
+from typing import List, Tuple
+import uncompyle6
+import io
+import os
+import subprocess
+from contextlib import redirect_stdout, redirect_stderr
+
+
+def run_pycdc(exe_path: str, pyc_file: str) -> str:
+    """
+    Executes pycdc with the given .pyc file and captures its standard output.
+
+    The executable is started directly with an argument list (no shell), so
+    characters in the paths, which may come from the scanned directory tree,
+    are never interpreted as shell syntax.
+
+    Args:
+        exe_path (str): Path to the pycdc.exe executable.
+        pyc_file (str): Path to the .pyc file to decompile.
+
+    Returns:
+        str: Output from pycdc.exe, or the sentinel "invalid" when exe_path
+        is not an existing file.
+    """
+    if not os.path.isfile(exe_path):
+        return "invalid"
+
+    result = subprocess.run(
+        [exe_path, pyc_file],
+        capture_output=True,
+        text=True,
+        shell=False,
+        encoding="utf-8",
+        errors="replace",  # undecodable decompiler output must not abort the scan
+    )
+
+    return result.stdout
+
+
+def disassemble_pyc(file_path: str, pycdc_addr=None) -> str:
+    """
+    Decompiles a .pyc file with uncompyle6, falling back to pycdc on failure.
+
+    Args:
+        file_path (str): The path to the .pyc file.
+        pycdc_addr: Path to the pycdc executable used when uncompyle6 raises;
+            None disables the fallback.
+
+    Returns:
+        str: The decompiled source on success. When uncompyle6 raises, the
+        sentinel "none" if pycdc_addr is None, otherwise whatever
+        run_pycdc returns for the file.
+    """
+    buffer = io.StringIO()
+    try:
+        uncompyle6.main.decompile_file(file_path, buffer)
+        decompiled = buffer.getvalue()
+    except Exception:
+        # Any decompiler failure triggers the fallback path.
+        return "none" if pycdc_addr is None else run_pycdc(pycdc_addr, file_path)
+    return decompiled
--- a/detection/utils.py
+++ b/detection/utils.py
@@ -4,7 +4,7 @@ import sys

 def read_file_content(file_path: str) -> str:
    try:
-        with open(file_path, "r", encoding="utf-8") as file:
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            return file.read()
    except FileNotFoundError:
        print("Error: File not found.")
@@ -21,4 +21,4 @@ def remove_comments(code: str, extension: str) -> str:
        code = re.sub(r"//.*", "", code)
        code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL)
        return code.strip()
-    return code.strip()
+    return code.strip()