diff --git a/detection/Regexdetection.py b/detection/Regexdetection.py index b1b1549..4296b49 100644 --- a/detection/Regexdetection.py +++ b/detection/Regexdetection.py @@ -25,15 +25,25 @@ def find_dangerous_functions( ".cpp": { r"\bsystem\(": "high", }, + ".pyc": { + r"\bexec\b": "high", + r"\beval\b": "high", + r"\bos\.system\b": "high", + r"\bos\.exec\b": "high", + r"\bos\.fork\b": "high", + r"\bos\.kill\b": "high", + r"\bos\.popen\b": "medium", + r"\bos\.spawn\b": "medium", + }, } risk_patterns = patterns.get(file_extension, {}) classified_results = {"high": [], "medium": [], "low": [], "none": []} - for line_number, line in enumerate(file_content.split("\n"), start=1): - clean_line = remove_comments(line, file_extension) - if not clean_line: - continue - for pattern, risk_level in risk_patterns.items(): - if re.search(pattern, clean_line): - classified_results[risk_level].append((line_number, clean_line)) + if file_content is not None: + for line_number, line in enumerate(file_content.split("\n"), start=1): + clean_line = remove_comments(line, file_extension) + if not clean_line: + continue + for pattern, risk_level in risk_patterns.items(): + if re.search(pattern, clean_line): + classified_results[risk_level].append((line_number, clean_line)) return classified_results - diff --git a/detection/__main__.py b/detection/__main__.py index 644ac18..9881eb5 100644 --- a/detection/__main__.py +++ b/detection/__main__.py @@ -5,11 +5,16 @@ from reportlab.lib.styles import getSampleStyleSheet from reportlab.platypus import Paragraph, Spacer, SimpleDocTemplate from .Regexdetection import find_dangerous_functions from .GPTdetection import detectGPT +from .pyc_detection import disassemble_pyc from .utils import * import sys from colorama import init, Fore, Style +from tqdm import tqdm +from pathlib import Path -SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp"} +PYCDC_FLAG = True +PYCDC_ADDR_FLAG = True +SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp", ".pyc"} OUTPUT_FORMATS = ["html", "md", "txt", "pdf"] ORDERS = [ "__import__", @@ -325,46 +330,74 @@ def output_text(results: Dict[str, List[Tuple[int, str]]], file_name=None): return text_output -def checkModeAndDetect(mode: str, filePath: str, fileExtension: str): +def checkModeAndDetect(mode: str, filePath: str, fileExtension: str, pycdc_addr: str): # TODO:添加更多方式,这里提高代码的复用性和扩展性 - if mode == "regex": - return find_dangerous_functions(read_file_content(filePath), fileExtension) - elif mode == "llm": - return detectGPT(read_file_content(filePath)) + if fileExtension == ".pyc": + # 反汇编pyc文件 + file_content = disassemble_pyc(filePath, pycdc_addr) + if file_content == "none": + global PYCDC_FLAG + PYCDC_FLAG = False + return "" + elif file_content == "invalid": + global PYCDC_ADDR_FLAG + PYCDC_ADDR_FLAG = False + if mode == "regex": + return find_dangerous_functions(file_content, fileExtension) + elif mode == "llm": + return detectGPT(file_content) + else: + return find_dangerous_functions(file_content, fileExtension) else: - return find_dangerous_functions(read_file_content(filePath), fileExtension) + file_content = read_file_content(filePath) + if mode == "regex": + return find_dangerous_functions(file_content, fileExtension) + elif mode == "llm": + return detectGPT(file_content) + else: + return find_dangerous_functions(file_content, fileExtension) -def process_path(path: str, output_format: str, mode: str, output_file=None): +def process_path( + path: str, output_format: str, mode: str, pycdc_addr: str, output_file=None +): results = {"high": [], "medium": [], "low": [], "none": []} if os.path.isdir(path): - for root, dirs, files in os.walk(path): - for file in files: - file_extension = os.path.splitext(file)[1] - if file_extension in SUPPORTED_EXTENSIONS: - file_path = os.path.join(root, file) + # 使用rglob获取所有文件 + all_files = [ + file_path + for file_path in Path(path).rglob("*") + if file_path.suffix in SUPPORTED_EXTENSIONS + ] - file_results = checkModeAndDetect(mode, file_path, file_extension) - for key in file_results: - if key != "none": # Exclude 'none' risk level - results[key].extend( - [ - (f"{file_path}: Line {line_num}", line) - for line_num, line in file_results[key] - ] - ) + # 扫描动画 + for file_path in tqdm(all_files, desc="Scanning files", unit="file"): + file_extension = file_path.suffix + file_results = checkModeAndDetect( + mode, str(file_path), file_extension, pycdc_addr + ) + if file_results is not None: + for key in file_results: + if key != "none": # Exclude 'none' risk level + results[key].extend( + [ + (f"{file_path}: Line {line_num}", line) + for line_num, line in file_results[key] + ] + ) elif os.path.isfile(path): file_extension = os.path.splitext(path)[1] if file_extension in SUPPORTED_EXTENSIONS: - file_results = checkModeAndDetect(mode, path, file_extension) - for key in file_results: - if key != "none": # Exclude 'none' risk level - results[key].extend( - [ - (f"{path}: Line {line_num}", line) - for line_num, line in file_results[key] - ] - ) + file_results = checkModeAndDetect(mode, path, file_extension, pycdc_addr) + if file_results is not None: + for key in file_results: + if key != "none": # Exclude 'none' risk level + results[key].extend( + [ + (f"{path}: Line {line_num}", line) + for line_num, line in file_results[key] + ] + ) else: print("Unsupported file type.") return @@ -386,6 +419,9 @@ def main(): parser.add_argument( "-m", "--mode", help="Mode of operation:[regex,llm]", default="regex" ) + parser.add_argument( + "-p", "--pycdc", help="Path to pycdc.exe to decompile", default=None + ) args = parser.parse_args() output_format = "txt" # Default output format output_file = None @@ -401,7 +437,15 @@ def main(): ) output_file = args.output.rsplit(".", 1)[0] + ".txt" # 如果未指定输出文件,则输出到 stdout;否则写入文件 - process_path(args.path, output_format, args.mode, output_file) + process_path(args.path, output_format, args.mode, args.pycdc, output_file) + if PYCDC_FLAG == False: + print( + "ERROR: Detected Python 3.11 or above .pyc files. You need to install pycdc and compile it yourself to obtain pycdc." + ) + print("Repo: https://github.com/zrax/pycdc.git") + if PYCDC_ADDR_FLAG == False: + print("ERROR: The specified pycdc.exe path is not valid") + print("Please check your pycdc path.") if __name__ == "__main__": diff --git a/detection/pyc_detection.py b/detection/pyc_detection.py new file mode 100644 index 0000000..d350421 --- /dev/null +++ b/detection/pyc_detection.py @@ -0,0 +1,49 @@ +from typing import List, Tuple +import uncompyle6 +import io +import os +import subprocess +from contextlib import redirect_stdout, redirect_stderr + + +def run_pycdc(exe_path: str, pyc_file: str) -> str: + """ + Executes pycdc.exe with the given .pyc file using a command line string and captures the output. + + Args: + exe_path (str): Path to the pycdc.exe executable. + pyc_file (str): Path to the .pyc file to decompile. + + Returns: + str: Output from pycdc.exe. + """ + if not os.path.isfile(exe_path): + return "invalid" + + command = f'"{exe_path}" "{pyc_file}"' + result = subprocess.run( + command, capture_output=True, text=True, shell=True, encoding="utf-8" + ) + + return result.stdout + + +def disassemble_pyc(file_path: str, pycdc_addr=None) -> str: + """ + Disassembles a .pyc file using uncompyle6. + + Args: + file_path (str): The path to the .pyc file. + + Returns: + str: The disassembled code as a string. + """ + output = io.StringIO() + try: + uncompyle6.main.decompile_file(file_path, output) + return output.getvalue() + except Exception as e: + if pycdc_addr is None: + return "none" + else: + return run_pycdc(pycdc_addr, file_path) diff --git a/detection/utils.py b/detection/utils.py index 563e7f0..c15e8ec 100644 --- a/detection/utils.py +++ b/detection/utils.py @@ -4,7 +4,7 @@ import sys def read_file_content(file_path: str) -> str: try: - with open(file_path, "r", encoding="utf-8") as file: + with open(file_path, "r", encoding="utf-8", errors="ignore") as file: return file.read() except FileNotFoundError: print("Error: File not found.") @@ -21,4 +21,4 @@ def remove_comments(code: str, extension: str) -> str: code = re.sub(r"//.*", "", code) code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL) return code.strip() - return code.strip() \ No newline at end of file + return code.strip() diff --git a/tests/test_backdoor_detection.py b/tests/test_backdoor_detection.py index 3d4fcf5..a0632c2 100644 --- a/tests/test_backdoor_detection.py +++ b/tests/test_backdoor_detection.py @@ -83,6 +83,30 @@ class TestBackdoorDetection(unittest.TestCase): self.assertEqual(len(results["medium"]), 0) self.assertEqual(len(results["low"]), 0) + def test_gpt_env_no_set(self): + if os.getenv("OPENAI_API_KEY") is not None: + self.skipTest("OPENAI_API_KEY is setted") + content = "print('test test')" + with self.assertRaises(ValueError): + detectGPT(content) + + def test_find_dangerous_functions_pyc(self): + file_content = """import os + os.system('rm -rf /') + """ + file_extension = ".pyc" + + expected_result = { + "high": [(2, "os.system('rm -rf /')")], + "medium": [], + "low": [], + "none": [], + } + + result = find_dangerous_functions(file_content, file_extension) + + self.assertEqual(result, expected_result) + if __name__ == "__main__": unittest.main()