[Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
@@ -131,6 +131,7 @@ if TYPE_CHECKING:
    VLLM_TPU_USING_PATHWAYS: bool = False
    VLLM_USE_DEEP_GEMM: bool = False
    VLLM_USE_DEEP_GEMM_E8M0: bool = True
    VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False
    VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False
    VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
@@ -954,9 +955,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),

    # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
    # E8M0 is faster on B200 but may reduce accuracy.
    "VLLM_USE_DEEP_GEMM_E8M0":
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))),
    # TODO(wentao): unify the two E8M0 flags after verifying the correctness.
    # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs.
    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER":
    lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))),
    # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
    # JIT all the required kernels before model execution so there is no
    # JIT'ing in the hot-path. However, this warmup increases the engine
@@ -1244,6 +1248,8 @@ def compute_hash() -> str:
    "VLLM_USE_FLASHINFER_SAMPLER",
    "VLLM_DISABLED_KERNELS",
    "VLLM_USE_DEEP_GEMM",
    "VLLM_USE_DEEP_GEMM_E8M0",
    "VLLM_USE_DEEP_GEMM_E8M0_HOPPER",
    "VLLM_USE_TRTLLM_FP4_GEMM",
    "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
    "VLLM_USE_FLASHINFER_MOE_FP8",
Reference in New Issue
Block a user