# BackDoorBuster/detection/backdoor_detection.py

# Usage: python backdoor_detection.py <code_path> <output_format>

import os
import re
import sys
from typing import Dict, List, Tuple

# File extensions (dot included, as returned by os.path.splitext) that are scanned.
SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp"}
# Report formats accepted on the command line; the chosen value is also used
# as the extension of the generated report file.
OUTPUT_FORMATS = ["html", "md", "txt"]


def read_file_content(file_path: str) -> str:
    """Return the full text of ``file_path``, decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
    raising ``UnicodeDecodeError``, so a single stray byte in a scanned file
    cannot abort the whole run (or be used to make the scanner skip a file).

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents.

    Exits:
        Prints an error message and terminates the process with status 1 if
        the file does not exist or cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="replace") as file:
            return file.read()
    except FileNotFoundError:
        print("Error: File not found.")
        sys.exit(1)
    except IOError:
        print("Error: Could not read file.")
        sys.exit(1)


def _strip_comments(text, line_marker, block_markers, quote_chars):
    """Return ``text`` without comments, leaving quoted literals untouched.

    ``line_marker`` starts a comment that runs to the end of the line;
    ``block_markers`` is an ``(opener, closer)`` pair or ``None``;
    ``quote_chars`` lists the characters that open and close a literal.
    """
    kept = []
    active_quote = None
    pos, end = 0, len(text)
    while pos < end:
        char = text[pos]
        if active_quote is not None:
            kept.append(char)
            if char == "\\" and pos + 1 < end:
                # Copy the escaped character verbatim so that \" or \' does
                # not close the literal.
                kept.append(text[pos + 1])
                pos += 2
                continue
            if char == active_quote:
                active_quote = None
            elif char == "\n" and active_quote != "`":
                # Ordinary quotes do not span lines; resetting here keeps a
                # stray apostrophe from swallowing the rest of the input.
                active_quote = None
            pos += 1
        elif char in quote_chars:
            active_quote = char
            kept.append(char)
            pos += 1
        elif text.startswith(line_marker, pos):
            newline = text.find("\n", pos)
            if newline == -1:
                break
            pos = newline  # keep the newline itself
        elif block_markers is not None and text.startswith(block_markers[0], pos):
            opener, closer = block_markers
            close_at = text.find(closer, pos + len(opener))
            if close_at == -1:
                # Unterminated block comment: keep the opener as ordinary
                # text and carry on scanning.
                kept.append(opener)
                pos += len(opener)
            else:
                pos = close_at + len(closer)
        else:
            kept.append(char)
            pos += 1
    return "".join(kept)


def remove_comments(code: str, extension: str) -> str:
    """Strip comments from ``code`` according to the language of ``extension``.

    ``.py`` removes ``#`` comments; ``.js`` and ``.cpp`` remove ``//`` and
    ``/* ... */`` comments. Comment markers that appear inside a quoted
    literal (for example ``"#"`` or ``"http://host"``) are left alone, so code
    that follows such a literal on the same line is not discarded.

    Quote tracking is lexical and deliberately simple: if a quote is left
    open (e.g. a C++ digit separator), the rest of the line is kept as-is, so
    the function errs on the side of keeping text rather than dropping code.

    Args:
        code: Source text; the scanner passes one line at a time.
        extension: File extension including the dot, e.g. ``".py"``.

    Returns:
        ``code`` without comments and with surrounding whitespace stripped.
        For any other extension the text is only stripped.
    """
    if extension == ".py":
        return _strip_comments(code, "#", None, "\"'").strip()
    if extension == ".js":
        # Backticks delimit JavaScript template literals.
        return _strip_comments(code, "//", ("/*", "*/"), "\"'`").strip()
    if extension == ".cpp":
        return _strip_comments(code, "//", ("/*", "*/"), "\"'").strip()
    return code.strip()


def find_dangerous_functions(
    file_content: str, file_extension: str
) -> Dict[str, List[Tuple[int, str]]]:
    """Classify every non-blank, non-comment line of a file by risk level.

    Each line is stripped of comments via ``remove_comments`` and matched
    against the regex patterns registered for ``file_extension``. A line that
    matches several patterns is filed under the most severe matching level,
    so the result does not depend on the order the patterns are listed in.

    Args:
        file_content: Full text of the file to analyse.
        file_extension: Extension including the dot (``".py"``, ``".js"``,
            ``".cpp"``). Unknown extensions have no patterns, so all of
            their lines end up under ``"none"``.

    Returns:
        A dict with the keys ``"high"``, ``"medium"``, ``"low"`` and
        ``"none"``; each value is a list of ``(line_number, cleaned_line)``
        tuples with 1-based line numbers. No current pattern maps to
        ``"low"``, so that list is always empty for now.
    """
    patterns = {
        ".py": {
            r"\bsystem\(": "high",
            r"\bexec\(": "high",
            r"\bpopen\(": "medium",
            r"\beval\(": "high",
            r"\bsubprocess\.run\(": "medium",
        },
        ".js": {
            r"\beval\(": "high",
            r"\bexec\(": "high",
            r"\bchild_process\.exec\(": "high",
        },
        ".cpp": {
            r"\bsystem\(": "high",
        },
    }
    # Most severe first; used to pick one bucket when several patterns match.
    severity_order = ("high", "medium", "low")
    risk_patterns = patterns.get(file_extension, {})
    classified_results: Dict[str, List[Tuple[int, str]]] = {
        "high": [],
        "medium": [],
        "low": [],
        "none": [],
    }
    for line_number, line in enumerate(file_content.split("\n"), start=1):
        clean_line = remove_comments(line, file_extension)
        if not clean_line:
            continue
        matched_levels = {
            risk_level
            for pattern, risk_level in risk_patterns.items()
            if re.search(pattern, clean_line)
        }
        line_risk = next(
            (level for level in severity_order if level in matched_levels), "none"
        )
        classified_results[line_risk].append((line_number, clean_line))
    return classified_results


def output_results(
    results: Dict[str, List[Tuple[int, str]]], output_format: str, file_path: str
):
    """Write the scan results for ``file_path`` as a report in ``./results/code``.

    The report is named ``<stem>.<output_format>``, where ``<stem>`` is the
    base name of ``file_path`` without its extension.

    NOTE: only the base name is used, so two scanned files that share a stem
    (e.g. ``a/util.py`` and ``b/util.js``) write to the same report file and
    the later one overwrites the earlier one.

    Args:
        results: Mapping of risk level to ``(line_number, line)`` tuples.
        output_format: ``"html"``, ``"md"`` or ``"txt"``. Any other value
            creates the results directory but writes no report.
        file_path: Path of the file that was scanned.
    """
    # For integration tests this should be set to ./
    results_dir = "./results/code"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(results_dir, exist_ok=True)

    report_stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file = os.path.join(results_dir, f"{report_stem}.{output_format}")

    if output_format == "html":
        output_html(results, output_file)
    elif output_format == "md":
        output_markdown(results, output_file)
    elif output_format == "txt":
        output_text(results, output_file)


def output_html(results: Dict[str, List[Tuple[int, str]]], file_name: str):
    """Write ``results`` to ``file_name`` as a UTF-8 encoded HTML report.

    The scanned source lines are untrusted input, so they (and the file name
    shown in the title) are HTML-escaped before being embedded; otherwise a
    line such as ``<script>...`` in a scanned file would execute when the
    report is opened in a browser.

    Args:
        results: Mapping of risk level to ``(line_number, line)`` tuples; one
            section is emitted per key, in the mapping's iteration order.
        file_name: Path of the report file to create or overwrite.
    """
    import html  # local import keeps this function self-contained

    parts = [
        '<html><head><meta charset="utf-8">'
        f"<title>Analysis of {html.escape(file_name)}</title></head><body>",
        "<h1>Security Analysis Report</h1>",
    ]
    for risk_level, entries in results.items():
        parts.append(f"<h2>{risk_level.capitalize()} Risk</h2><ul>")
        parts.extend(
            f"<li>Line {line_num}: {html.escape(line)}</li>"
            for line_num, line in entries
        )
        parts.append("</ul>")
    parts.append("</body></html>")
    # Explicit encoding: the platform default may be unable to encode
    # non-ASCII characters taken from the scanned source.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("".join(parts))


def output_markdown(results: Dict[str, List[Tuple[int, str]]], file_name: str):
    """Write ``results`` to ``file_name`` as a UTF-8 encoded Markdown report.

    The report has a top-level heading naming ``file_name``, then one ``##``
    section per risk level with one bullet per flagged line.

    Args:
        results: Mapping of risk level to ``(line_number, line)`` tuples; one
            section is emitted per key, in the mapping's iteration order.
        file_name: Path of the report file to create or overwrite.
    """
    parts = [f"# Security Analysis Report for {file_name}\n"]
    for risk_level, entries in results.items():
        parts.append(f"## {risk_level.capitalize()} Risk\n")
        parts.extend(f"- Line {line_num}: {line}\n" for line_num, line in entries)
    # Explicit encoding: input is read as UTF-8, and the platform default may
    # be unable to encode non-ASCII characters from the scanned source.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("".join(parts))


def output_text(results: Dict[str, List[Tuple[int, str]]], file_name: str):
    """Write ``results`` to ``file_name`` as a UTF-8 encoded plain-text report.

    The report has a header line naming ``file_name``, then one
    ``<Level> Risk:`` section per risk level with one indented line per
    flagged source line.

    Args:
        results: Mapping of risk level to ``(line_number, line)`` tuples; one
            section is emitted per key, in the mapping's iteration order.
        file_name: Path of the report file to create or overwrite.
    """
    parts = [f"Security Analysis Report for {file_name}\n"]
    for risk_level, entries in results.items():
        parts.append(f"{risk_level.capitalize()} Risk:\n")
        parts.extend(f"  Line {line_num}: {line}\n" for line_num, line in entries)
    # Explicit encoding: input is read as UTF-8, and the platform default may
    # be unable to encode non-ASCII characters from the scanned source.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write("".join(parts))


def _scan_and_report(file_path: str, file_extension: str, output_format: str):
    """Scan one file and write its report in ``output_format``."""
    file_results = find_dangerous_functions(
        read_file_content(file_path), file_extension
    )
    output_results(file_results, output_format, file_path)


def process_path(path: str, output_format: str):
    """Scan a single file, or every supported file under a directory.

    Extensions are compared case-insensitively, so ``X.PY`` or ``X.JS`` are
    scanned just like their lower-case counterparts.

    Args:
        path: A file or directory path. Directories are walked recursively
            and files with unsupported extensions are skipped silently.
        output_format: Report format, passed through to ``output_results``.

    Behaviour on bad input:
        A single file with an unsupported extension prints
        "Unsupported file type." and returns. A path that is neither a file
        nor a directory prints "Invalid path." and exits with status 1.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for file in files:
                file_extension = os.path.splitext(file)[1].lower()
                if file_extension not in SUPPORTED_EXTENSIONS:
                    continue
                file_path = os.path.join(root, file)
                print(f"Processing {file_path}...")
                _scan_and_report(file_path, file_extension, output_format)
    elif os.path.isfile(path):
        file_extension = os.path.splitext(path)[1].lower()
        if file_extension in SUPPORTED_EXTENSIONS:
            _scan_and_report(path, file_extension, output_format)
        else:
            print("Unsupported file type.")
    else:
        print("Invalid path.")
        sys.exit(1)


def main():
    """Command-line entry point: ``backdoor_detection.py <path> <output_format>``.

    Prints a usage message and exits with status 1 when fewer than two
    arguments are given, or an error listing the supported formats when the
    requested format is not one of ``OUTPUT_FORMATS``. Otherwise hands the
    path and format to ``process_path``. Extra arguments are ignored.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python backdoor_detection.py <path> <output_format>")
        sys.exit(1)

    target, report_format = cli_args[0], cli_args[1]
    if report_format in OUTPUT_FORMATS:
        process_path(target, report_format)
        return

    print(
        f"Unsupported output format. Supported formats are: {', '.join(OUTPUT_FORMATS)}"
    )
    sys.exit(1)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()