[torch.compile] Fine-grained CustomOp enabling mechanism (#9300)

2024-10-17 14:36:37 -04:00
parent 7871659abb
commit 0f41fbe5a3
8 changed files with 220 additions and 21 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -65,6 +65,7 @@ if TYPE_CHECKING:
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_TORCH_COMPILE_LEVEL: int = 0
+    VLLM_CUSTOM_OPS: List[str] = []
    VLLM_DISABLED_KERNELS: List[str] = []


@@ -205,7 +206,17 @@ environment_variables: Dict[str, Callable[[], Any]] = {
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
    "VLLM_TORCH_COMPILE_LEVEL":
    lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),
-
+    # Fine-grained control over which custom ops to enable/disable.
+    # Use 'all' to enable all, 'none' to disable all.
+    # Also specify a list of custom op names to enable (prefixed with a '+'),
+    # or disable (prefixed with a '-').
+    # Examples:
+    # - 'all,-op1' to enable all except op1
+    # - 'none,+op1,+op2' to enable only op1 and op2
+    # By default, all custom ops are enabled when running without Inductor
+    # and disabled when running with Inductor (compile_level >= Inductor).
+    "VLLM_CUSTOM_OPS":
+    lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","),
    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":