[CI] Prune Quantization Tests and skip compilation (#27038)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-10-16 17:26:35 -04:00
committed by GitHub
parent b3dda72c23
commit 01c977e96d
9 changed files with 62 additions and 134 deletions

View File

@@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with vllm_runner(
- model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
+ model_path,
+ enforce_eager=True,
+ kv_cache_dtype=kv_cache_dtype,
+ tensor_parallel_size=tp,
) as llm:
def check_model(model):
@@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
llm.apply_model(check_model)
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output
@pytest.mark.parametrize("tp", [1])
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
- with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+ with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
def check_model(model):
layer = model.model.layers[0]
@@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
llm.apply_model(check_model)
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output
@pytest.mark.parametrize("tp", [1])
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
- with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+ with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
def check_model(model):
layer = model.model.layers[0]
@@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
llm.apply_model(check_model)
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
assert output