[Feature] Enable E8M0 by Default on Hopper for DeepGEMM, 5% E2E throughput improvement (#26197)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -629,25 +629,25 @@ def get_config(
|
||||
|
||||
if quantization_config is not None:
|
||||
config.quantization_config = quantization_config
|
||||
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
|
||||
# auto-enable DeepGEMM UE8M0 if model config requests it
|
||||
scale_fmt = quantization_config.get("scale_fmt", None)
|
||||
if scale_fmt in ("ue8m0",):
|
||||
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
|
||||
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
|
||||
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0"):
|
||||
os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "1"
|
||||
logger.info_once(
|
||||
(
|
||||
"Detected quantization_config.scale_fmt=%s; "
|
||||
"enabling Hopper UE8M0."
|
||||
"enabling UE8M0 for DeepGEMM."
|
||||
),
|
||||
scale_fmt,
|
||||
)
|
||||
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
|
||||
elif not envs.VLLM_USE_DEEP_GEMM_E8M0:
|
||||
logger.warning_once(
|
||||
(
|
||||
"Model config requests UE8M0 "
|
||||
"(quantization_config.scale_fmt=%s), but "
|
||||
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
|
||||
"Hopper UE8M0 disabled."
|
||||
"VLLM_USE_DEEP_GEMM_E8M0=0 is set; "
|
||||
"UE8M0 for DeepGEMM disabled."
|
||||
),
|
||||
scale_fmt,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user