[CI] Prune Quantization Tests and skip compilation (#27038)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
parent b3dda72c23
commit 01c977e96d
9 changed files with 62 additions and 134 deletions
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
 def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
    model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
    with vllm_runner(
-        model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
+        model_path,
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        tensor_parallel_size=tp,
    ) as llm:

        def check_model(model):
@@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):

        llm.apply_model(check_model)

-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output


@pytest.mark.parametrize("tp", [1])
 def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
    model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:

        def check_model(model):
            layer = model.model.layers[0]
@@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):

        llm.apply_model(check_model)

-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output


@pytest.mark.parametrize("tp", [1])
 def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
    model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
-    with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
+    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:

        def check_model(model):
            layer = model.model.layers[0]
@@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):

        llm.apply_model(check_model)

-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output