[CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
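The diff below applies the same trims across vLLM's compressed-tensors quantization tests: redundant model parametrizations are deleted, `enforce_eager=True` is passed to every `vllm_runner` call so the engine stays in eager mode instead of compiling, and the generation budget drops from 20 tokens to 4 (32 to 8 for the logprobs test). A minimal sketch of the shape the tests take after the change, assuming the `vllm_runner` pytest fixture these tests already use (the test name here is hypothetical):

```python
# Sketch only: relies on the vllm_runner pytest fixture used throughout
# this file; test_quant_smoke is an illustrative name, not a test from it.
def test_quant_smoke(vllm_runner):
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
    # enforce_eager=True keeps the model in eager mode, skipping
    # torch.compile / CUDA graph capture -- the "skip compilation" in the title.
    with vllm_runner(model_path, enforce_eager=True) as llm:
        # 4 tokens instead of 20: these tests only smoke-check that
        # generation produces something, not its quality.
        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output
```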
@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
             2560,
             True,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-            "channel",
-            QuantizationType.INT,
-            2560,
-            True,
-        ),
         (
             "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
             "tensor",
@@ -138,7 +131,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
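Most hunks below leave the `check_model` callback untouched; only the runner invocation and token budget change. For orientation, a hedged sketch of that recurring pattern — the attribute path follows the Llama layout these tests assume, the model name comes from a hunk below, and the exact assertion and test name are illustrative:

```python
# Sketch of the check_model/apply_model pattern shared by these tests.
def test_scheme_attached(vllm_runner):  # hypothetical test name
    with vllm_runner(
        "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", enforce_eager=True
    ) as llm:

        def check_model(model):
            layer = model.model.layers[0]        # first decoder block
            qkv_proj = layer.self_attn.qkv_proj  # fused QKV linear layer
            assert qkv_proj.scheme is not None   # quant scheme set at load time

        llm.apply_model(check_model)             # run the callback on the model
```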
@@ -146,12 +139,9 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
     "model_path",
     [
         "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
     ],
 )
-@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("num_logprobs", [10])
 @pytest.mark.parametrize(
     "use_aiter", [True, False] if current_platform.is_rocm() else [False]
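With those deletions, only the 1B checkpoint and the smaller token budget survive in the logprobs test. A reconstruction of the decorator stack as it reads after this hunk (assuming, per the line counts above, that the three 8B entries are the pruned ones):

```python
@pytest.mark.parametrize(
    "model_path",
    [
        "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    ],
)
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
)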
@@ -211,7 +201,7 @@ def test_compressed_tensors_w8a8_logprobs(
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
     model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
     with vllm_runner(model_path) as llm:
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -219,15 +209,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
     "model_args",
     [
         ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
         (
             "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
             "channel",
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-            "channel",
-        ),
     ],
 )
 @pytest.mark.parametrize(
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
         # this will enable VLLM_ROCM_USE_AITER_LINEAR
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
 
-    with vllm_runner(model_path, dtype=torch.float16) as llm:
+    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
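This is the one hunk with ROCm-specific setup: the AITER linear path is switched on through an environment variable before the runner starts (the comment and the `monkeypatch.setenv` call are in the hunk as-is). A self-contained sketch of that gate — the test name and skip reason are illustrative; the env var, model, and dtype come from the hunks:

```python
import pytest
import torch

from vllm.platforms import current_platform


def test_w8a8_dynamic_per_token_aiter(vllm_runner, monkeypatch):  # hypothetical name
    if not current_platform.is_rocm():
        pytest.skip("AITER kernels are ROCm-only")  # illustrative skip reason
    # this will enable VLLM_ROCM_USE_AITER_LINEAR
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
```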
@@ -268,7 +253,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
         assert output
 
 
@@ -283,38 +268,6 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
             True,
             False,
         ),
-        (
-            "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-            "group",
-            128,
-            8,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-            "channel",
-            None,
-            4,
-            True,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
-            "group",
-            128,
-            8,
-            False,
-            False,
-        ),
-        (
-            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
-            "channel",
-            None,
-            8,
-            False,
-            False,
-        ),
         (
             "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
             "group",
@@ -330,7 +283,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
 )
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
     model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -348,7 +301,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -357,7 +310,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
 )
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
     model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -370,13 +323,13 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
 def test_compressed_tensors_fp8(vllm_runner):
     model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -399,7 +352,7 @@ def test_compressed_tensors_fp8(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output
 
 
@@ -412,8 +365,8 @@ def test_compressed_tensors_fp8(vllm_runner):
 )
 def test_compressed_tensors_kv_cache(vllm_runner):
     model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
-        output = llm.generate_greedy("Hello world!", max_tokens=20)
+    with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
+        output = llm.generate_greedy("Hello world!", max_tokens=4)
         assert output
 
 
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
 )
 def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -476,7 +429,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -512,7 +465,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -528,7 +481,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -564,7 +517,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -580,7 +533,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -611,7 +564,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
     model, weight_strategy, input_strategy = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -622,7 +575,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -637,7 +590,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -656,7 +609,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -670,7 +623,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
 )
 def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     model = args_2of4
-    with vllm_runner(model) as llm:
+    with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
             layer = model.model.layers[0]
@@ -689,7 +642,7 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -723,7 +676,7 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
             assert qkv_proj.scheme.group_size == 16
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -758,7 +711,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
             assert proj.scheme.group_size == 128
 
         llm.apply_model(check_model)
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         print(output)
         assert output
 
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
 
 def test_compressed_tensors_fp8_block_enabled(vllm_runner):
     model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
-    with vllm_runner(model_path) as llm:
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         fp8_dtype = current_platform.fp8_dtype()
 
         def check_model(model):
@@ -816,5 +769,5 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
 
         llm.apply_model(check_model)
 
-        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        output = llm.generate_greedy("Hello my name is", max_tokens=4)
         assert output