[Feature] Add VLLM_USE_DEEP_GEMM_E8M0 Env to Control E8M0 Scale (#21968)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
-from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
+from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used

 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled

@@ -1387,8 +1387,8 @@ def fused_experts(hidden_states: torch.Tensor,
     # E8M0 scale, which means we requantize the weight and input to the specific
     # scale. Fallen back to cutlass or triton for some cases would cause
     # accuracy issue.
-    should_use_deep_gemm = is_blackwell_deep_gemm_used() or _valid_deep_gemm(
-        hidden_states, w1, w2)
+    should_use_deep_gemm = is_blackwell_deep_gemm_e8m0_used(
+    ) or _valid_deep_gemm(hidden_states, w1, w2)
     if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm):
         assert apply_router_weight_on_input is False
         assert is_act_and_mul, (
|
||||
Reference in New Issue
Block a user