[torch.compile] Fuse RMSNorm with quant (#9138)

Signed-off-by: luka <luka@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@126.com>
2024-11-08 16:20:08 -05:00
parent e1b5a82179
commit 4f93dfe952
17 changed files with 1335 additions and 368 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -68,6 +68,7 @@ if TYPE_CHECKING:
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
    VLLM_TORCH_COMPILE_LEVEL: int = 0
+    VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None
    VLLM_CUSTOM_OPS: List[str] = []
    VLLM_DISABLED_KERNELS: List[str] = []
    VLLM_USE_V1: bool = False
@@ -226,6 +227,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # and disabled when running with Inductor (compile_level >= Inductor).
    "VLLM_CUSTOM_OPS":
    lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","),
+
    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":