[UX] Integrate DeepGEMM into vLLM wheel via CMake (#37980)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
2026-04-09 03:56:32 +02:00
parent 83aea2147f
commit eb4205fee5
12 changed files with 251 additions and 40 deletions
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@@ -14,7 +14,11 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    get_fp8_min_max,
 )
 from vllm.platforms import current_platform
-from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
+from vllm.utils.deep_gemm import (
+    DeepGemmQuantScaleFMT,
+    has_deep_gemm,
+    transform_sf_into_required_layout,
+)
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import set_random_seed

@@ -256,8 +260,6 @@ def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dt
            and current_platform.has_device_capability(100)
            and scale_fmt == DeepGemmQuantScaleFMT.UE8M0
        ):
-            from deep_gemm import transform_sf_into_required_layout
-
            _q, _s = ref_with_scale_fmt(
                E,
                T,