[2/N][torch.compile] make compilation cfg part of vllm cfg (#10383)

Signed-off-by: youkaichao <youkaichao@gmail.com>
commit 4fd9375028 (parent 661a34fd4f)
Author: youkaichao (committed by GitHub)
Date: 2024-11-16 18:02:14 -08:00
27 changed files with 359 additions and 283 deletions

@@ -69,7 +69,6 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_TORCH_COMPILE_LEVEL: int = 0
     VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None
-    VLLM_CUSTOM_OPS: List[str] = []
     VLLM_DISABLED_KERNELS: List[str] = []
     VLLM_USE_V1: bool = False
     VLLM_ENABLE_V1_MULTIPROCESSING: bool = False
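
For context, these defaults live under TYPE_CHECKING in vllm/envs.py, while the runtime values are resolved lazily through a dict of zero-argument lambdas and a module-level __getattr__. Below is a condensed sketch of that pattern, trimmed to two variables and not the full module:

    import os
    from typing import TYPE_CHECKING, Any, Callable, Dict, Optional

    if TYPE_CHECKING:
        # Static defaults so type checkers see the attributes.
        VLLM_TORCH_COMPILE_LEVEL: int = 0
        VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None

    # Values are read lazily: os.environ is consulted on attribute
    # access, not at module import time.
    environment_variables: Dict[str, Callable[[], Any]] = {
        "VLLM_TORCH_COMPILE_LEVEL":
        lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")),
        "VLLM_TORCH_COMPILE_CONFIG":
        lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None),
    }

    def __getattr__(name: str) -> Any:
        # PEP 562: attribute access on the module dispatches to the
        # matching lambda in the table above.
        if name in environment_variables:
            return environment_variables[name]()
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}")
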
@@ -217,18 +216,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_TORCH_COMPILE_CONFIG":
     lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None),
-    # Fine-grained control over which custom ops to enable/disable.
-    # Use 'all' to enable all, 'none' to disable all.
-    # Also specify a list of custom op names to enable (prefixed with a '+'),
-    # or disable (prefixed with a '-').
-    # Examples:
-    # - 'all,-op1' to enable all except op1
-    # - 'none,+op1,+op2' to enable only op1 and op2
-    # By default, all custom ops are enabled when running without Inductor
-    # and disabled when running with Inductor (compile_level >= Inductor).
-    "VLLM_CUSTOM_OPS":
-    lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","),
     # local rank of the process in the distributed setting, used to determine
     # the GPU device id
     "LOCAL_RANK":