[Quantization] Support Quark W8A8 INT8 MoE inference (#36320)
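
This adds an MoE quantization method for Quark W8A8 INT8 checkpoints, so
quantized experts load through QuarkW8A8Int8MoEMethod instead of falling
back to an unsupported path. For reference, a minimal offline-inference
sketch (assuming the tiny test checkpoint added below is available on the
Hugging Face Hub; quantization="quark" is normally inferred from the
checkpoint config, so passing it explicitly is optional):

    # Sketch only: run the tiny Quark W8A8 INT8 MoE test model with vLLM.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="nameistoken/tiny-qwen3-moe-w8a8-int8-quark",
        quantization="quark",  # optional; auto-detected from the config
        enforce_eager=True,
    )
    out = llm.generate(["Hello"], SamplingParams(temperature=0.0, max_tokens=4))
    print(out[0].outputs[0].text)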

Signed-off-by: kangletian <Letian.Kang@amd.com>
Author: PikaPikachu
Date: 2026-04-10 01:24:43 +08:00
Committed by: GitHub
parent 56e19d7ee2
commit 827268e98d
4 changed files with 360 additions and 2 deletions

@@ -22,6 +22,9 @@ from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
     QuarkW8A8Fp8,
     QuarkW8A8Int8,
 )
+from vllm.model_executor.layers.quantization.quark.quark_moe import (  # noqa: E501
+    QuarkW8A8Int8MoEMethod,
+)
 from vllm.platforms import current_platform
 
 from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
@@ -126,6 +129,34 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
     assert output
+
+
+@pytest.mark.parametrize("tp", [1])
+def test_quark_int8_w8a8_moe(vllm_runner, tp):
+    """Test W8A8 INT8 MoE quantization with a tiny Qwen3 MoE model."""
+    model_path = "nameistoken/tiny-qwen3-moe-w8a8-int8-quark"
+    with vllm_runner(
+        model_path,
+        enforce_eager=True,
+        tensor_parallel_size=tp,
+        gpu_memory_utilization=0.1,
+    ) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+            # MoE experts should use QuarkW8A8Int8MoEMethod
+            moe = layer.mlp.experts
+            assert isinstance(moe.quant_method, QuarkW8A8Int8MoEMethod), (
+                f"Expected QuarkW8A8Int8MoEMethod, got {type(moe.quant_method)}"
+            )
+            # Non-MoE linear layers should use QuarkW8A8Int8
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.scheme, QuarkW8A8Int8)
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy("Hello", max_tokens=4)
+        assert output
 
 
 def test_quark_fp8_parity(vllm_runner):
     quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
     fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"