Merge pull request 'feature/rglob' (#29 ) from feature/rglob into main

Reviewed-on: #29 Reviewed-by: sangge <sangge@noreply.localhost> Reviewed-by: ccyj <ccyj@noreply.localhost>
feat: 使用rglob扫描
2024-06-03 20:24:42 +08:00 · 2024-06-03 16:29:35 +08:00 · 2024-06-03 11:41:19 +08:00 · 2024-06-02 20:24:03 +08:00 · 2024-06-02 19:54:47 +08:00 · 2024-05-31 21:13:01 +08:00
5 changed files with 169 additions and 42 deletions
--- a/detection/Regexdetection.py
+++ b/detection/Regexdetection.py
@@ -25,15 +25,25 @@ def find_dangerous_functions(
        ".cpp": {
            r"\bsystem\(": "high",
        },
+        ".pyc": {
+            r"\bexec\b": "high",
+            r"\beval\b": "high",
+            r"\bos\.system\b": "high",
+            r"\bos\.exec\b": "high",
+            r"\bos\.fork\b": "high",
+            r"\bos\.kill\b": "high",
+            r"\bos\.popen\b": "medium",
+            r"\bos\.spawn\b": "medium",
+        },
    }
    risk_patterns = patterns.get(file_extension, {})
    classified_results = {"high": [], "medium": [], "low": [], "none": []}
-    for line_number, line in enumerate(file_content.split("\n"), start=1):
-        clean_line = remove_comments(line, file_extension)
-        if not clean_line:
-            continue
-        for pattern, risk_level in risk_patterns.items():
-            if re.search(pattern, clean_line):
-                classified_results[risk_level].append((line_number, clean_line))
+    if file_content is not None:
+        for line_number, line in enumerate(file_content.split("\n"), start=1):
+            clean_line = remove_comments(line, file_extension)
+            if not clean_line:
+                continue
+            for pattern, risk_level in risk_patterns.items():
+                if re.search(pattern, clean_line):
+                    classified_results[risk_level].append((line_number, clean_line))
    return classified_results
-
--- a/detection/main.py
+++ b/detection/main.py
@@ -5,11 +5,16 @@ from reportlab.lib.styles import getSampleStyleSheet
 from reportlab.platypus import Paragraph, Spacer, SimpleDocTemplate
 from .Regexdetection import find_dangerous_functions
 from .GPTdetection import detectGPT
+from .pyc_detection import disassemble_pyc
 from .utils import *
 import sys
 from colorama import init, Fore, Style
+from tqdm import tqdm
+from pathlib import Path

-SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp"}
+PYCDC_FLAG = True
+PYCDC_ADDR_FLAG = True
+SUPPORTED_EXTENSIONS = {".py", ".js", ".cpp", ".pyc"}
 OUTPUT_FORMATS = ["html", "md", "txt", "pdf"]
 ORDERS = [
    "__import__",
@@ -325,46 +330,74 @@ def output_text(results: Dict[str, List[Tuple[int, str]]], file_name=None):
        return text_output


-def checkModeAndDetect(mode: str, filePath: str, fileExtension: str):
+def checkModeAndDetect(mode: str, filePath: str, fileExtension: str, pycdc_addr: str):
    # TODO:添加更多方式，这里提高代码的复用性和扩展性
-    if mode == "regex":
-        return find_dangerous_functions(read_file_content(filePath), fileExtension)
-    elif mode == "llm":
-        return detectGPT(read_file_content(filePath))
+    if fileExtension == ".pyc":
+        # 反汇编pyc文件
+        file_content = disassemble_pyc(filePath, pycdc_addr)
+        if file_content == "none":
+            global PYCDC_FLAG
+            PYCDC_FLAG = False
+            return ""
+        elif file_content == "invalid":
+            global PYCDC_ADDR_FLAG
+            PYCDC_ADDR_FLAG = False
+        if mode == "regex":
+            return find_dangerous_functions(file_content, fileExtension)
+        elif mode == "llm":
+            return detectGPT(file_content)
+        else:
+            return find_dangerous_functions(file_content, fileExtension)
    else:
-        return find_dangerous_functions(read_file_content(filePath), fileExtension)
+        file_content = read_file_content(filePath)
+        if mode == "regex":
+            return find_dangerous_functions(file_content, fileExtension)
+        elif mode == "llm":
+            return detectGPT(file_content)
+        else:
+            return find_dangerous_functions(file_content, fileExtension)


-def process_path(path: str, output_format: str, mode: str, output_file=None):
+def process_path(
+    path: str, output_format: str, mode: str, pycdc_addr: str, output_file=None
+):
    results = {"high": [], "medium": [], "low": [], "none": []}
    if os.path.isdir(path):
-        for root, dirs, files in os.walk(path):
-            for file in files:
-                file_extension = os.path.splitext(file)[1]
-                if file_extension in SUPPORTED_EXTENSIONS:
-                    file_path = os.path.join(root, file)
+        # 使用rglob获取所有文件
+        all_files = [
+            file_path
+            for file_path in Path(path).rglob("*")
+            if file_path.suffix in SUPPORTED_EXTENSIONS
+        ]

-                    file_results = checkModeAndDetect(mode, file_path, file_extension)
-                    for key in file_results:
-                        if key != "none":  # Exclude 'none' risk level
-                            results[key].extend(
-                                [
-                                    (f"{file_path}: Line {line_num}", line)
-                                    for line_num, line in file_results[key]
-                                ]
-                            )
+        # 扫描动画
+        for file_path in tqdm(all_files, desc="Scanning files", unit="file"):
+            file_extension = file_path.suffix
+            file_results = checkModeAndDetect(
+                mode, str(file_path), file_extension, pycdc_addr
+            )
+            if file_results is not None:
+                for key in file_results:
+                    if key != "none":  # Exclude 'none' risk level
+                        results[key].extend(
+                            [
+                                (f"{file_path}: Line {line_num}", line)
+                                for line_num, line in file_results[key]
+                            ]
+                        )
    elif os.path.isfile(path):
        file_extension = os.path.splitext(path)[1]
        if file_extension in SUPPORTED_EXTENSIONS:
-            file_results = checkModeAndDetect(mode, path, file_extension)
-            for key in file_results:
-                if key != "none":  # Exclude 'none' risk level
-                    results[key].extend(
-                        [
-                            (f"{path}: Line {line_num}", line)
-                            for line_num, line in file_results[key]
-                        ]
-                    )
+            file_results = checkModeAndDetect(mode, path, file_extension, pycdc_addr)
+            if file_results is not None:
+                for key in file_results:
+                    if key != "none":  # Exclude 'none' risk level
+                        results[key].extend(
+                            [
+                                (f"{path}: Line {line_num}", line)
+                                for line_num, line in file_results[key]
+                            ]
+                        )
        else:
            print("Unsupported file type.")
            return
@@ -386,6 +419,9 @@ def main():
    parser.add_argument(
        "-m", "--mode", help="Mode of operation:[regex,llm]", default="regex"
    )
+    parser.add_argument(
+        "-p", "--pycdc", help="Path to pycdc.exe to decompile", default=None
+    )
    args = parser.parse_args()
    output_format = "txt"  # Default output format
    output_file = None
@@ -401,7 +437,15 @@ def main():
            )
            output_file = args.output.rsplit(".", 1)[0] + ".txt"
    # 如果未指定输出文件，则输出到 stdout；否则写入文件
-    process_path(args.path, output_format, args.mode, output_file)
+    process_path(args.path, output_format, args.mode, args.pycdc, output_file)
+    if PYCDC_FLAG == False:
+        print(
+            "ERROR: Detected Python 3.11 or above .pyc files. You need to install pycdc and compile it yourself to obtain pycdc."
+        )
+        print("Repo: https://github.com/zrax/pycdc.git")
+    if PYCDC_ADDR_FLAG == False:
+        print("ERROR: The specified pycdc.exe path is not valid")
+        print("Please check your pycdc path.")


 if __name__ == "__main__":
--- a/detection/pyc_detection.py
+++ b/detection/pyc_detection.py
@@ -0,0 +1,49 @@
+from typing import List, Tuple
+import uncompyle6
+import io
+import os
+import subprocess
+from contextlib import redirect_stdout, redirect_stderr
+
+
+def run_pycdc(exe_path: str, pyc_file: str) -> str:
+    """
+    Run pycdc on a .pyc file without a shell (file names are untrusted input).
+
+    Args:
+        exe_path (str): Path to the pycdc executable.
+        pyc_file (str): Path to the .pyc file to decompile.
+
+    Returns:
+        str: pycdc's stdout, or "invalid" if pycdc is missing or cannot be run.
+    """
+    if not os.path.isfile(exe_path):
+        return "invalid"
+    try:
+        result = subprocess.run(
+            [exe_path, pyc_file], capture_output=True, text=True, encoding="utf-8"
+        )
+    except OSError:
+        return "invalid"
+    return result.stdout
+
+
+def disassemble_pyc(file_path: str, pycdc_addr=None) -> str:
+    """
+    Decompile a .pyc file with uncompyle6, falling back to pycdc on failure.
+
+    Args:
+        file_path (str): The path to the .pyc file.
+        pycdc_addr: Path to a pycdc executable used as fallback, or None.
+
+    Returns:
+        str: The decompiled source; "none" if uncompyle6 failed and no pycdc
+        path was given; otherwise whatever run_pycdc returns.
+    """
+    buffer = io.StringIO()
+    try:
+        uncompyle6.main.decompile_file(file_path, buffer)
+        return buffer.getvalue()
+    except Exception:
+        # Any uncompyle6 failure hands the file over to pycdc when available.
+        return "none" if pycdc_addr is None else run_pycdc(pycdc_addr, file_path)
--- a/detection/utils.py
+++ b/detection/utils.py
@@ -4,7 +4,7 @@ import sys

 def read_file_content(file_path: str) -> str:
    try:
-        with open(file_path, "r", encoding="utf-8") as file:
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            return file.read()
    except FileNotFoundError:
        print("Error: File not found.")
@@ -21,4 +21,4 @@ def remove_comments(code: str, extension: str) -> str:
        code = re.sub(r"//.*", "", code)
        code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL)
        return code.strip()
-    return code.strip()
+    return code.strip()
--- a/tests/test_backdoor_detection.py
+++ b/tests/test_backdoor_detection.py
@@ -83,6 +83,30 @@ class TestBackdoorDetection(unittest.TestCase):
        self.assertEqual(len(results["medium"]), 0)
        self.assertEqual(len(results["low"]), 0)

+    def test_gpt_env_no_set(self):
+        """detectGPT must raise ValueError when OPENAI_API_KEY is unset."""
+        if os.getenv("OPENAI_API_KEY") is not None:
+            self.skipTest("OPENAI_API_KEY is setted")
+        with self.assertRaises(ValueError):
+            detectGPT("print('test test')")
+
+    def test_find_dangerous_functions_pyc(self):
+        """An os.system call in .pyc-derived text is reported as high risk."""
+        file_content = """import os
+        os.system('rm -rf /')
+        """
+        result = find_dangerous_functions(file_content, ".pyc")
+
+        self.assertEqual(
+            result,
+            {
+                "high": [(2, "os.system('rm -rf /')")],
+                "medium": [],
+                "low": [],
+                "none": [],
+            },
+        )
+

 if __name__ == "__main__":
    unittest.main()
Author	SHA1	Message	Date
ccyj	49408eda9f	Merge pull request 'feature/rglob' (#29 ) from feature/rglob into main Reviewed-on: #29 Reviewed-by: sangge <sangge@noreply.localhost> Reviewed-by: ccyj <ccyj@noreply.localhost>	2024-06-03 20:24:42 +08:00
dqy	d1ac4594e4	feat: 使用rglob扫描	2024-06-03 16:29:35 +08:00
dqy	62b77812af	fix: 去除扫描单个文件进度条	2024-06-03 11:41:19 +08:00
dqy	7eb4de8e6c	style: 添加扫描动画	2024-06-02 20:24:03 +08:00
dqy	b99334ed12	fix: 解决unicode字符报错	2024-06-02 19:54:47 +08:00
dqy	17245a9bcf	fix: 解决unicode编码错误	2024-05-31 21:13:01 +08:00
dqy	b673575fe4	fix: 删除无效模块	2024-05-31 20:36:42 +08:00
dqy	df65fff2c7	feat: 添加对python 3.11的反编译模块	2024-05-31 20:33:47 +08:00
dqy	aeb4a33d98	Merge branch 'main' of https://git.mamahaha.work/sangge/BackDoorBuster into feature/pyc-detection	2024-05-31 19:20:35 +08:00
dqy	e80e83ad51	Merge branch 'main' of https://git.mamahaha.work/sangge/BackDoorBuster into feature/pyc-detection Some checks failed Python application test / build (pull_request) Failing after 52s	2024-05-30 16:13:40 +08:00
dqy	8a14ef4341	fix: 修改相对模块引入	2024-05-29 20:36:09 +08:00
dqy	e418bbf380	test: 添加反汇编之后的正则匹配测试	2024-05-29 20:32:24 +08:00
dqy	d30ea0ca61	feat: 添加反汇编模块依赖	2024-05-29 20:31:42 +08:00
dqy	40f5c07fa1	feat: 添加对pyc文件的反汇编功能模块	2024-05-29 20:08:40 +08:00