[Feature] Add VLLM_USE_DEEP_GEMM_E8M0 Env to Control E8M0 Scale (#21968)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-11 12:39:08 -04:00
parent 8e13d9fe6d
commit f7dcce7a4a
9 changed files with 65 additions and 39 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -127,6 +127,7 @@ if TYPE_CHECKING:
    VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
    VLLM_TPU_USING_PATHWAYS: bool = False
    VLLM_USE_DEEP_GEMM: bool = False
+    VLLM_USE_DEEP_GEMM_E8M0: bool = True
    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
@@ -925,6 +926,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_DEEP_GEMM":
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),

+    # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
+    # E8M0 is faster on B200 but may reduce accuracy.
+    "VLLM_USE_DEEP_GEMM_E8M0":
+    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))),
    # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
    # JIT all the required kernels before model execution so there is no
    # JIT'ing in the hot-path. However, this warmup increases the engine