[V1] Support LLM.apply_model (#18465)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-09-20 15:14:35 +08:00
committed by GitHub
parent be874c0201
commit 3d9a1d2de5
17 changed files with 194 additions and 169 deletions

View File

@@ -105,18 +105,21 @@ def test_register_quantization_config():
])
def test_custom_quant(vllm_runner, model, monkeypatch):
"""Test infer with the custom quantization method."""
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=model,
quantization="custom_quant",
enforce_eager=True) as llm:
model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
# Check the quantization method is FakeQuantLinearMethod
assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
# Check the quantization method is FakeQuantLinearMethod
assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)
llm.apply_model(check_model)
output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output