[torch.compile] Fuse RMSNorm with quant (#9138)

Signed-off-by: luka <luka@neuralmagic.com>
Co-authored-by: youkaichao <youkaichao@126.com>
This commit is contained in:
Luka Govedič
2024-11-08 16:20:08 -05:00
committed by GitHub
parent e1b5a82179
commit 4f93dfe952
17 changed files with 1335 additions and 368 deletions

View File

@@ -68,6 +68,7 @@ if TYPE_CHECKING:
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
VLLM_TORCH_COMPILE_LEVEL: int = 0
VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None
VLLM_CUSTOM_OPS: List[str] = []
VLLM_DISABLED_KERNELS: List[str] = []
VLLM_USE_V1: bool = False
@@ -226,6 +227,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# and disabled when running with Inductor (compile_level >= Inductor).
"VLLM_CUSTOM_OPS":
lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","),
# local rank of the process in the distributed setting, used to determine
# the GPU device id
"LOCAL_RANK":