[Feature][Quantization] MXFP4 support for MOE models (#17888)
Signed-off-by: Felix Marty <felmarty@amd.com> Signed-off-by: Bowen Bao <bowenbao@amd.com> Signed-off-by: Felix Marty <Felix.Marty@amd.com> Co-authored-by: Bowen Bao <bowenbao@amd.com>
This commit is contained in:
57
tests/kernels/moe/test_mxfp4_moe.py
Normal file
57
tests/kernels/moe/test_mxfp4_moe.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import importlib
import importlib.metadata
import importlib.util
from dataclasses import dataclass

import pytest
import torch
from packaging import version
def _quark_mxfp4_available() -> bool:
    """Return True iff amd-quark >= 0.9 (pre-release 0.8.99+) is installed.

    ``find_spec`` alone is not sufficient: the ``quark`` module may be
    importable while the ``amd-quark`` distribution metadata is missing
    (e.g. a local source checkout), in which case
    ``importlib.metadata.version`` raises ``PackageNotFoundError``.
    Guarding it keeps pytest collection from crashing and simply reports
    the feature as unavailable.
    """
    if importlib.util.find_spec("quark") is None:
        return False
    try:
        installed = version.parse(importlib.metadata.version("amd-quark"))
    except importlib.metadata.PackageNotFoundError:
        return False
    return installed >= version.parse('0.8.99')


# Module-level flag consumed by the skipif marker below.
QUARK_MXFP4_AVAILABLE = _quark_mxfp4_available()
@dataclass
class ModelCase:
    """One MXFP4-quantized checkpoint to exercise in the smoke test.

    Bundles the Hugging Face repository id with the tensor-parallel
    degree (GPU count) the checkpoint needs to load.
    """

    # Hugging Face model repository id passed to vllm_runner.
    model_id: str
    # Required tensor-parallel size; the test skips when fewer GPUs exist.
    tp: int
@pytest.mark.parametrize('model_case', [
    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
])
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
                    reason="amd-quark>=0.9 is not available")
def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
    """Smoke-test that an MXFP4 MoE checkpoint loads and generates tokens."""
    # Guard clause: not enough GPUs for the requested tensor-parallel size.
    num_gpus = torch.cuda.device_count()
    if num_gpus < model_case.tp:
        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
                    f"{num_gpus}")

    with vllm_runner(model_case.model_id,
                     tensor_parallel_size=model_case.tp,
                     load_format="dummy") as llm:

        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
        # Re-enable this later.
        # def check_model(model):
        #     layer = model.model.layers[0]

        #     qkv_proj = layer.self_attn.qkv_proj

        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)

        #     assert isinstance(layer.mlp.experts.quant_method,
        #                       QuarkW4A4MXFp4MoEMethod)

        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
        #     llm.apply_model(check_model)

        # A non-empty greedy generation is enough to prove the quantized
        # weights loaded and the MoE kernels executed end to end.
        output = llm.generate_greedy("Today I am in the French Alps and",
                                     max_tokens=20)
        assert output
Reference in New Issue
Block a user