[Feature] Add VLLM_USE_DEEP_GEMM_E8M0 Env to Control E8M0 Scale (#21968)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
+from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used

 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled

@@ -1387,8 +1387,8 @@ def fused_experts(hidden_states: torch.Tensor,
     # E8M0 scale, which means we requantize the weight and input to the specific
     # scale. Fallen back to cutlass or triton for some cases would cause
     # accuracy issue.
-    should_use_deep_gemm = is_blackwell_deep_gemm_used() or _valid_deep_gemm(
-        hidden_states, w1, w2)
+    should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used(
+    ) or _valid_deep_gemm(hidden_states, w1, w2)
     if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm):
         assert apply_router_weight_on_input is False
         assert is_act_and_mul, (
|
||||
Reference in New Issue
Block a user