[CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
|
||||
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
|
||||
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
|
||||
with vllm_runner(
|
||||
- model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
|
||||
+ model_path,
|
||||
+ enforce_eager=True,
|
||||
+ kv_cache_dtype=kv_cache_dtype,
|
||||
+ tensor_parallel_size=tp,
|
||||
) as llm:
|
||||
|
||||
def check_model(model):
|
||||
@@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", [1])
|
||||
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
|
||||
model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
|
||||
- with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
|
||||
+ with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
@@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", [1])
|
||||
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
||||
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
|
||||
- with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
|
||||
+ with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
@@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
- output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||
+ output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user