[Quantization] Support Quark W8A8 INT8 MoE inference (#36320)
Signed-off-by: kangletian <Letian.Kang@amd.com>
This commit is contained in:
@@ -22,6 +22,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
|
||||
    QuarkW8A8Fp8,
    QuarkW8A8Int8,
)
from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
    QuarkW8A8Int8MoEMethod,
)
from vllm.platforms import current_platform

from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
|
||||
@@ -126,6 +129,34 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", [1])
def test_quark_int8_w8a8_moe(vllm_runner, tp):
    """Load a tiny Quark-quantized Qwen3 MoE checkpoint and verify that the
    W8A8 INT8 quantization paths are wired up correctly.

    Checks two things on the first decoder layer:
      * the MoE expert weights are served by ``QuarkW8A8Int8MoEMethod``;
      * dense (non-MoE) linear projections use the ``QuarkW8A8Int8`` scheme;
    then runs a short greedy generation to exercise the kernels end to end.
    """
    model_path = "nameistoken/tiny-qwen3-moe-w8a8-int8-quark"
    with vllm_runner(
        model_path,
        enforce_eager=True,
        tensor_parallel_size=tp,
        gpu_memory_utilization=0.1,
    ) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]
            # Expert weights must be handled by the INT8 MoE method.
            quant_method = first_layer.mlp.experts.quant_method
            assert isinstance(quant_method, QuarkW8A8Int8MoEMethod), (
                f"Expected QuarkW8A8Int8MoEMethod, got {type(quant_method)}"
            )
            # Dense attention projections use the per-layer INT8 scheme.
            assert isinstance(
                first_layer.self_attn.qkv_proj.scheme, QuarkW8A8Int8
            )

        llm.apply_model(check_model)

        # A tiny greedy decode proves the quantized forward pass runs.
        output = llm.generate_greedy("Hello", max_tokens=4)
        assert output
|
||||
|
||||
|
||||
def test_quark_fp8_parity(vllm_runner):
|
||||
quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
|
||||
fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"
|
||||
|
||||
Reference in New Issue
Block a user