[Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Authored by Wentao Ye on 2025-08-27 10:09:08 -04:00; committed via GitHub.
parent 513c1fe255
commit 3af47c3cc6
10 changed files with 68 additions and 53 deletions

View File

@@ -131,6 +131,7 @@ if TYPE_CHECKING:
VLLM_TPU_USING_PATHWAYS: bool = False
VLLM_USE_DEEP_GEMM: bool = False
VLLM_USE_DEEP_GEMM_E8M0: bool = True
VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
@@ -954,9 +955,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
# Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
# E8M0 is faster on B200 but may reduce accuracy.
"VLLM_USE_DEEP_GEMM_E8M0":
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))),
# TODO(wentao): unify the two E8M0 flags after verifying the correctness.
# Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER":
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine
@@ -1244,6 +1248,8 @@ def compute_hash() -> str:
"VLLM_USE_FLASHINFER_SAMPLER",
"VLLM_DISABLED_KERNELS",
"VLLM_USE_DEEP_GEMM",
"VLLM_USE_DEEP_GEMM_E8M0",
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
"VLLM_USE_TRTLLM_FP4_GEMM",
"VLLM_USE_FUSED_MOE_GROUPED_TOPK",
"VLLM_USE_FLASHINFER_MOE_FP8",