[Bugfix] Make compressed-tensors MoEs respect ignored layers (#28878)
Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
This commit is contained in:
@@ -10,6 +10,7 @@ import torch
|
||||
from compressed_tensors.quantization import QuantizationType
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
||||
CompressedTensors24,
|
||||
CompressedTensorsLinearMethod,
|
||||
@@ -767,3 +768,50 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="This test is not for non-CUDA platforms",
)
def test_compressed_tensors_moe_ignore_with_model(vllm_runner):
    """
    Check that compressed-tensors MoE layers honor the ignore list.

    Loads a compressed-tensors MoE checkpoint and inspects the fused-MoE
    ``experts`` module of two decoder layers:

    - layer 0 must use ``CompressedTensorsMoEMethod`` (quantized)
    - layer 3 must use ``UnquantizedFusedMoEMethod`` (ignored, unquantized)

    Afterwards a short greedy generation is run to confirm that the mixed
    quantized/unquantized model still produces output.

    NOTE(review): the checkpoint name ("...W4A16-first-only...") suggests
    that only the first MoE layer is quantized and the rest are listed in
    the ignore config -- confirm against the checkpoint's quantization
    config.
    """

    # Alternative checkpoint kept for reference (original note: "CT 12.3"):
    # model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only"
    # Checkpoint actually used (original note: "CT 12.2").
    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"

    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            from vllm.model_executor.layers.fused_moe import FusedMoE
            from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
                CompressedTensorsMoEMethod,
            )

            # (decoder layer index, quant method its experts must use)
            expected_methods = (
                (0, CompressedTensorsMoEMethod),  # quantized layer
                (3, UnquantizedFusedMoEMethod),  # ignored layer
            )

            for layer_idx, method_cls in expected_methods:
                experts = model.model.layers[layer_idx].mlp.experts
                assert isinstance(experts, FusedMoE)
                assert isinstance(experts.quant_method, method_cls)

        llm.apply_model(check_model)

        # Smoke-check that generation still works end to end.
        generated = llm.generate_greedy("Hello, my name is", max_tokens=4)
        assert generated
|
||||
|
||||
Reference in New Issue
Block a user