# BackDoorBuster/detection/__main__.py

import json
import os
from typing import Dict, List, Tuple, Optional
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, Spacer, SimpleDocTemplate

from detection.pickle_detection import pickleDataDetection
from .Regexdetection import find_dangerous_functions
from .GPTdetection import detectGPT
from .pyc_detection import disassemble_pyc
from .utils import *
import sys
from colorama import init, Fore, Style
from tqdm import tqdm
from pathlib import Path

# Cleared by checkModeAndDetect() when disassemble_pyc() reports "none";
# main() then tells the user that pycdc has to be installed.
PYCDC_FLAG = True
# Cleared by checkModeAndDetect() when disassemble_pyc() reports "invalid";
# main() then tells the user that the supplied pycdc path is not valid.
PYCDC_ADDR_FLAG = True
# File suffixes that process_path() hands to checkModeAndDetect().
SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp", ".pyc"}
# Report formats (file extensions without the leading dot).
OUTPUT_FORMATS = ["html", "md", "txt", "pdf"]
# Substrings that highlight_orders() wraps in color codes. The list is walked
# in order, so the position of each entry matters.
ORDERS = [
    "__import__",
    "system",
    "exec",
    "popen",
    "eval",
    "subprocess",
    "__getattribute__",
    "getattr",
    "child_process",
]

# Initialize colorama; autoreset=True resets the style after every print.
init(autoreset=True)

# 256-color ANSI foreground escape (color index 214). Not referenced in the
# visible part of this module.
ORANGE = "\033[38;5;214m"
# Color used by highlight_orders() for "low" risk matches.
CYAN = Fore.CYAN


def supports_color() -> bool:
    """
    Tell whether color escape sequences should be written to the terminal.

    Returns:
        bool: True on Windows, or when ``sys.stdout`` exposes ``isatty`` and
        reports being attached to a terminal; False in every other case.
    """
    # Windows is always treated as color-capable.
    if sys.platform == "win32":
        return True

    # Everywhere else the answer depends on stdout being a real TTY.
    stream = sys.stdout
    return bool(hasattr(stream, "isatty") and stream.isatty())


def supports_emoji() -> bool:
    """
    Tell whether emoji should be written to the terminal.

    Returns:
        bool: True on every non-Windows platform. On Windows, True only when
        the ``WT_SESSION`` environment variable is set.
    """
    # Heuristic only: non-Windows terminals are assumed to render emoji.
    if sys.platform != "win32":
        return True
    return os.getenv("WT_SESSION") is not None


def highlight_orders(line: str, risk_level: str, use_color: bool) -> str:
    """
    Highlights every occurrence of a name from ORDERS in the line.

    All names are matched in a single pass, longest name first, so a name that
    is a substring of another one (``getattr`` inside ``__getattribute__``) is
    never wrapped a second time inside an already highlighted match.

    Args:
        line (str): The line to highlight.
        risk_level (str): The risk level of the line ("high", "medium", "low").
            Any other value is highlighted in white.
        use_color (bool): Whether to use color for highlighting. When False the
            line is returned unchanged.

    Returns:
        str: The highlighted line.
    """
    import re

    # Without color the wrapping would add empty strings only.
    if not use_color:
        return line
    if not ORDERS:
        return line

    risk_colors = {
        "high": Fore.RED,
        "medium": Fore.YELLOW,
        "low": CYAN,
    }
    color = risk_colors.get(risk_level, Fore.WHITE)
    reset = Style.RESET_ALL

    # Longest alternatives first: the regex engine takes the first alternative
    # that matches at a position, so "__getattribute__" must precede "getattr".
    alternatives = sorted(ORDERS, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(name) for name in alternatives))
    return pattern.sub(lambda match: f"{color}{match.group(0)}{reset}", line)


def generate_text_content(results: Dict[str, List[Tuple[int, str]]]) -> str:
    """
    Generates a formatted text report for security analysis results.

    The "pickles" key is rendered as a list of ``file:json-result`` lines, the
    "none" key is skipped, and every other non-empty key is rendered as a risk
    section. Risk levels without a dedicated color or emoji fall back to white
    and to no emoji instead of raising ``KeyError``.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
            Entries under "pickles" are dicts with "file" and "result" keys.

    Returns:
        str: The formatted text report as a string.
    """
    use_color = supports_color()
    use_emoji = supports_emoji()

    # Everything that does not depend on the section is resolved once.
    reset = Style.RESET_ALL if use_color else ""
    line_num_color = Fore.GREEN if use_color else ""
    risk_colors = {
        "high": Fore.RED,
        "medium": Fore.YELLOW,
        "low": Fore.GREEN,
    }
    risk_emoji = {"High": "👹", "Medium": "👾", "Low": "👻"} if use_emoji else {}

    # Pieces are collected in a list and joined once at the end.
    parts = ["Security Analysis Report\n", "=" * 30 + "\n\n"]

    for risk_level, entries in results.items():
        if risk_level == "pickles":
            parts.append("Pickles:\n")
            parts.extend(
                f"  {item['file']}:{json.dumps(item['result'])}\n" for item in entries
            )
        elif entries and risk_level != "none":
            title = risk_level.capitalize()
            risk_color = risk_colors.get(risk_level, Fore.WHITE) if use_color else ""

            parts.append(
                f"{risk_color}{title} Risk{risk_emoji.get(title, '')}:{reset}\n"
            )
            # Underline as wide as "<Level> Risk:" without the emoji.
            parts.append("-" * (len(risk_level) + 6) + "\n")
            for line_num, line in entries:
                line = highlight_orders(line, risk_level, use_color)
                parts.append(
                    f"{reset} {line_num_color}{line_num}{reset}: {line}{reset}\n"
                )
            parts.append("\n")

    return "".join(parts)


def output_results(
    results: Dict[str, List[Tuple[int, str]]],
    output_format: str,
    output_file: Optional[str] = None,
) -> None:
    """
    Writes the analysis results to a file, or prints them when no file is given.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
        output_format (str): One of "pdf", "html", "md", "txt". Any other value falls back to "txt"
            and the output file name is rewritten to end in ".txt".
        output_file (Optional[str]): Destination file. If empty or None, a text report is printed to the terminal.
    """
    # No destination: render the text report straight to stdout.
    if not output_file:
        print(generate_text_content(results))
        return

    known_formats = {"pdf", "html", "md", "txt"}
    if output_format not in known_formats:
        base_name = os.path.splitext(output_file)[0]
        output_format = "txt"
        output_file = f"{base_name}.txt"

    # Create the parent directory when the path names one that is missing.
    target_dir = os.path.dirname(output_file)
    if target_dir != "" and not os.path.exists(target_dir):
        os.makedirs(target_dir)

    writers = {
        "pdf": output_pdf,
        "html": output_html,
        "md": output_markdown,
    }
    # Anything not listed above is written as plain text.
    write_report = writers.get(output_format, output_text)
    write_report(results, output_file)

def output_pdf(results: Dict[str, List[Tuple[int, str]]], file_name: str) -> None:
    """
    Generates a PDF report for security analysis results.

    Paragraph text is XML-escaped before it is handed to ReportLab, because
    ``Paragraph`` interprets its input as markup and scanned source lines
    routinely contain ``<``, ``>`` and ``&``.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
            The "none" key is skipped. Dict entries (as stored under "pickles") are rendered as
            ``file: json-result``; all other entries are ``(line_num, line)`` pairs.
        file_name (str): The name of the PDF file to write.
    """
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(file_name, pagesize=letter)
    styles = getSampleStyleSheet()

    # Add the title centered
    title_style = styles["Title"]
    title_style.alignment = 1  # Center alignment
    story = [
        Paragraph("Security Analysis Report", title_style),
        Spacer(1, 20),  # Space after title
    ]

    # Add risk levels and entries
    normal_style = styles["BodyText"]
    heading_style = styles["Heading2"]
    for risk_level, entries in results.items():
        if risk_level == "none":
            continue
        story.append(
            Paragraph(f"{escape(risk_level.capitalize())} Risk:", heading_style)
        )
        for entry in entries:
            if isinstance(entry, dict):
                # Pickle scan results are dicts, not (line_num, line) pairs.
                text = f"{entry.get('file', '')}: {json.dumps(entry.get('result'))}"
            else:
                line_num, line = entry
                text = f"Line {line_num}: {line}"
            story.append(Paragraph(escape(text), normal_style))
        story.append(Spacer(1, 12))  # Space between sections

    doc.build(story)


def output_html(results: Dict[str, List[Tuple[int, str]]], file_name=None):
    """
    Generates an HTML report for security analysis results.

    Every piece of scanned text is HTML-escaped before insertion, since the
    report embeds lines taken from potentially malicious source files. Risk
    levels without a dedicated emoji (for example "pickles") get a plain
    heading instead of raising ``KeyError``.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
            The "none" key is skipped. Dict entries (as stored under "pickles") are rendered as
            ``file: json-result``; all other entries are ``(line_num, line)`` pairs.
        file_name (Optional[str]): The name of the file to save the HTML output. If None, returns the HTML string.

    Returns:
        Optional[str]: The HTML string if file_name is None, otherwise None.
    """
    from html import escape

    risk_emoji = {"High": "👹", "Medium": "👾", "Low": "👻"}

    parts = [
        """
    <html>
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <link rel="icon" href="https://s2.loli.net/2024/05/30/WDc6MekjbuCU9Qo.png">
        <title>Security Analysis Report</title>
        <style>
            body {
                background-image: url('https://s2.loli.net/2024/05/30/85Mv7leB2IRWNp6.jpg');
                background-size: 100%, auto;
                background-attachment: fixed;
                font-family: Arial, sans-serif;
            }
            h1, h2 {
                color: white;
            }
            ul {
                list-style-type: none;
                padding: 0;
            }
            li {
                background: rgba(255, 255, 255, 0.8);
                margin: 5px 0;
                padding: 10px;
                border-radius: 5px;
            }
        </style>
    </head>
    <body>
        <h1>Security Analysis Report</h1>
    """
    ]

    for risk_level, entries in results.items():
        if risk_level == "none":
            continue
        title = risk_level.capitalize()
        parts.append(f"<h2>{escape(title)} Risk{risk_emoji.get(title, '')}</h2><ul>")
        for entry in entries:
            if isinstance(entry, dict):
                # Pickle scan results are dicts, not (line_num, line) pairs.
                label = str(entry.get("file", ""))
                text = json.dumps(entry.get("result"))
            else:
                line_num, line = entry
                label, text = str(line_num), str(line)
            parts.append(f"<li>{escape(label)}: {escape(text)}</li>")
        parts.append("</ul>")

    parts.append("</body></html>")
    html_output = "".join(parts)

    if file_name:
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(html_output)
        return None
    return html_output


def output_markdown(results: Dict[str, List[Tuple[int, str]]], file_name=None):
    """
    Generates a Markdown report for security analysis results.

    Pipe characters inside cell values are escaped so that scanned code such
    as ``a | b`` does not break the table layout. The file is written as UTF-8.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
            The "none" key is skipped. Dict entries (as stored under "pickles") are rendered as a
            ``file`` / ``json-result`` row; all other entries are ``(line_num, line)`` pairs.
        file_name (Optional[str]): The name of the file to save the Markdown output. If None, returns the Markdown string.

    Returns:
        Optional[str]: The Markdown string if file_name is None, otherwise None.
    """

    def cell(value) -> str:
        # A raw "|" would be read as a column separator.
        return str(value).replace("|", "\\|")

    parts = ["# Security Analysis Report\n\n"]

    for risk_level, entries in results.items():
        if risk_level == "none":
            continue
        parts.append(f"## {risk_level.capitalize()} Risk\n\n")
        parts.append("| Line Number | Description |\n")
        parts.append("|-------------|-------------|\n")
        for entry in entries:
            if isinstance(entry, dict):
                # Pickle scan results are dicts, not (line_num, line) pairs.
                first, second = entry.get("file", ""), json.dumps(entry.get("result"))
            else:
                first, second = entry
            parts.append(f"| {cell(first)} | {cell(second)} |\n")
        parts.append("\n")

    md_output = "".join(parts)

    if file_name:
        # Explicit UTF-8: scanned lines may contain non-ASCII text.
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(md_output)
        return None
    return md_output


def output_text(results: Dict[str, List[Tuple[int, str]]], file_name=None):
    """
    Generates a plain text report for security analysis results.

    The file is written as UTF-8 so that non-ASCII text in scanned lines does
    not depend on the platform's default encoding.

    Args:
        results (Dict[str, List[Tuple[int, str]]]): The security analysis results categorized by risk levels.
            The "none" key is skipped. Dict entries (as stored under "pickles") are rendered as
            ``file: json-result``; all other entries are ``(line_num, line)`` pairs.
        file_name (Optional[str]): The name of the file to save the text output. If None, returns the text string.

    Returns:
        Optional[str]: The text string if file_name is None, otherwise None.
    """
    report_title = "Security Analysis Report"
    parts = [f"{report_title}\n", "=" * len(report_title) + "\n\n"]

    for risk_level, entries in results.items():
        if risk_level == "none":
            continue
        heading = f"{risk_level.capitalize()} Risk:"
        parts.append(f"{heading}\n")
        parts.append("-" * len(heading) + "\n")
        for entry in entries:
            if isinstance(entry, dict):
                # Pickle scan results are dicts, not (line_num, line) pairs.
                parts.append(
                    f"  {entry.get('file', '')}: {json.dumps(entry.get('result'))}\n"
                )
            else:
                line_num, line = entry
                parts.append(f"  Line {line_num}: {line}\n")
        parts.append("\n")

    text_output = "".join(parts)

    if file_name:
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(text_output)
        return None
    return text_output


def checkModeAndDetect(mode: str, filePath: str, fileExtension: str, pycdc_addr: str):
    """
    Loads one file and runs the selected detector on its content.

    ``.pyc`` files are first turned into text with ``disassemble_pyc``. When
    that call returns one of its sentinel strings, the matching module-level
    flag is cleared and an empty string is returned, so the sentinel itself is
    never scanned as if it were source code.

    Args:
        mode (str): "llm" selects ``detectGPT``; "regex" and any other value
            select ``find_dangerous_functions``.
        filePath (str): Path of the file to analyze.
        fileExtension (str): Suffix of the file, including the leading dot.
        pycdc_addr (str): pycdc location forwarded to ``disassemble_pyc``.

    Returns:
        The detector's result, or "" when a ``.pyc`` file could not be
        converted to text.
    """
    global PYCDC_FLAG, PYCDC_ADDR_FLAG

    # TODO: add more detection modes; improve reuse and extensibility here.
    if fileExtension == ".pyc":
        # Disassemble/decompile the .pyc file into text.
        file_content = disassemble_pyc(filePath, pycdc_addr)
        if file_content == "none":
            # main() reports this as "pycdc needs to be installed".
            PYCDC_FLAG = False
            return ""
        if file_content == "invalid":
            # main() reports this as "the pycdc path is not valid".
            PYCDC_ADDR_FLAG = False
            return ""
    else:
        file_content = read_file_content(filePath)

    if mode == "llm":
        return detectGPT(file_content)
    # "regex" and every unrecognized mode use the regex detector.
    return find_dangerous_functions(file_content, fileExtension)


def process_path(
    path: str, output_format: str, mode: str, pycdc_addr: str, output_file=None
):
    """
    Scans a file or a directory tree and emits the combined report.

    Directory scans now include ``.pkl`` / ``.pickle`` files (they used to be
    filtered out before the pickle check could run) and skip anything that is
    not a regular file.

    Args:
        path (str): File or directory to analyze.
        output_format (str): Report format forwarded to ``output_results``.
        mode (str): Detection mode forwarded to ``checkModeAndDetect``.
        pycdc_addr (str): pycdc location forwarded to ``checkModeAndDetect``.
        output_file: Report destination forwarded to ``output_results``; it is
            also passed as the second argument of ``pickleDataDetection``.

    Returns:
        None. Prints a message and returns without a report for a single file
        of unsupported type; exits with status 1 when ``path`` does not exist.
    """
    pickle_extensions = {".pkl", ".pickle"}
    results = {"high": [], "medium": [], "low": [], "none": [], "pickles": []}

    def scan_file(file_path: str, file_extension: str) -> None:
        # Analyze one file and merge its findings into ``results``.
        if file_extension in pickle_extensions:
            res = pickleDataDetection(file_path, output_file)
            results["pickles"].append({"file": file_path, "result": res})
            return
        file_results = checkModeAndDetect(
            mode, file_path, file_extension, pycdc_addr
        )
        # None, "" and empty mappings all mean "nothing to merge".
        if not file_results:
            return
        for key in file_results:
            if key == "none":  # Exclude 'none' risk level
                continue
            results.setdefault(key, []).extend(
                (f"{file_path}: Line {line_num}", line)
                for line_num, line in file_results[key]
            )

    if os.path.isdir(path):
        scannable = set(SUPPORTED_EXTENSIONS) | pickle_extensions
        # rglob walks the tree recursively; keep regular files only.
        all_files = [
            file_path
            for file_path in Path(path).rglob("*")
            if file_path.suffix in scannable and file_path.is_file()
        ]

        # Progress bar while scanning
        for file_path in tqdm(all_files, desc="Scanning files", unit="file"):
            scan_file(str(file_path), file_path.suffix)
    elif os.path.isfile(path):
        file_extension = os.path.splitext(path)[1]
        if (
            file_extension in pickle_extensions
            or file_extension in SUPPORTED_EXTENSIONS
        ):
            scan_file(str(path), file_extension)
        else:
            print("Unsupported file type.")
            return
    else:
        print("Invalid path.")
        sys.exit(1)

    output_results(results, output_format, output_file)


def main():
    """
    Command-line entry point: parses arguments, runs the scan, reports pycdc issues.

    The report format is taken from the extension of ``--output``. For an
    unknown or missing extension the report is written as text to the same
    path with its extension replaced by ``.txt``. The base name is computed
    with ``os.path.splitext`` so that dots in directory names (``./out/report``)
    are left alone.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Backdoor detection tool.", prog="detection"
    )
    parser.add_argument("path", help="Path to the code to analyze")
    parser.add_argument("-o", "--output", help="Output file path", default=None)
    parser.add_argument(
        "-m", "--mode", help="Mode of operation:[regex,llm]", default="regex"
    )
    parser.add_argument(
        "-p",
        "--pycdc",
        help="Path to pycdc.exe to decompile",
        default=os.getenv("pycdc"),
    )
    args = parser.parse_args()
    output_format = "txt"  # Default output format
    output_file = None
    if args.output:
        base_name, ext = os.path.splitext(args.output)
        ext = ext.lower()
        if ext in [".html", ".md", ".txt", ".pdf"]:
            output_format = ext.replace(".", "")
            output_file = args.output
        else:
            print(
                "Your input file format was incorrect, the output has been saved as a TXT file."
            )
            output_file = base_name + ".txt"
    # Without an output file the report goes to stdout; otherwise it is written to the file.
    process_path(args.path, output_format, args.mode, args.pycdc, output_file)
    if not PYCDC_FLAG:
        print(
            "ERROR: Detected Python 3.11 or above .pyc files. You need to install pycdc and compile it yourself to obtain pycdc."
        )
        print("Repo: https://github.com/zrax/pycdc.git")
    if not PYCDC_ADDR_FLAG:
        print("ERROR: The specified pycdc.exe path is not valid")
        print("Please check your pycdc path.")

# Run the command-line interface only when this module is executed directly.
if __name__ == "__main__":
    main()