[Feature][Quantization] MXFP4 support for MOE models (#17888)

Signed-off-by: Felix Marty <felmarty@amd.com> Signed-off-by: Bowen Bao <bowenbao@amd.com> Signed-off-by: Felix Marty <Felix.Marty@amd.com> Co-authored-by: Bowen Bao <bowenbao@amd.com>
2025-07-09 22:19:02 +02:00
parent bf03ff3575
commit 332d4cb17b
15 changed files with 873 additions and 104 deletions
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_mxfp4_moe.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+import importlib.metadata
+from dataclasses import dataclass
+
+import pytest
+import torch
+from packaging import version
+
+QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
+    "quark") is not None and version.parse(
+        importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
+
+
+@dataclass
+class ModelCase:
+    model_id: str
+    tp: int
+
+
+@pytest.mark.parametrize('model_case', [
+    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
+    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
+    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
+])
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
+                    reason="amd-quark>=0.9 is not available")
+def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
+    if torch.cuda.device_count() < model_case.tp:
+        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
+                    f"{torch.cuda.device_count()}")
+
+    with vllm_runner(model_case.model_id,
+                     tensor_parallel_size=model_case.tp,
+                     load_format="dummy") as llm:
+
+        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
+        # Re-enable this later.
+        # def check_model(model):
+        #     layer = model.model.layers[0]
+
+        #     qkv_proj = layer.self_attn.qkv_proj
+
+        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
+
+        #     assert isinstance(layer.mlp.experts.quant_method,
+        #                       QuarkW4A4MXFp4MoEMethod)
+
+        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
+        #     llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Today I am in the French Alps and",
+                                     max_tokens=20)
+        assert output