[Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666)

Signed-off-by: yewentao256 <zhyanwentao@126.com> Signed-off-by: youkaichao <youkaichao@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-27 10:09:08 -04:00
parent 513c1fe255
commit 3af47c3cc6
10 changed files with 68 additions and 53 deletions
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -501,6 +501,24 @@ def get_config(

    if quantization_config is not None:
        config.quantization_config = quantization_config
+        # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
+        scale_fmt = quantization_config.get("scale_fmt", None)
+        if scale_fmt in ("ue8m0", ):
+            if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
+                os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
+                logger.info_once(
+                    ("Detected quantization_config.scale_fmt=%s; "
+                     "enabling Hopper UE8M0."),
+                    scale_fmt,
+                )
+            elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
+                logger.warning_once(
+                    ("Model config requests UE8M0 "
+                     "(quantization_config.scale_fmt=%s), but "
+                     "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
+                     "Hopper UE8M0 disabled."),
+                    scale_fmt,
+                )

    if hf_overrides_kw:
        logger.debug("Overriding HF config with %s", hf_overrides_kw)