[CI] Prune Quantization Tests and skip compilation (#27038)

Signed-off-by: mgoin <mgoin64@gmail.com>
Author: Michael Goin
Date: 2025-10-16 17:26:35 -04:00
Committed by: GitHub
Parent: b3dda72c23
Commit: 01c977e96d

9 changed files with 62 additions and 134 deletions
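The change repeated across the hunks below is twofold: enforce_eager=True is
added so each test skips torch.compile, and max_tokens drops from 32 to 4,
since four greedy tokens are enough to confirm that the quantized checkpoint
loads and generates. A minimal sketch of the resulting test shape, assuming
vLLM's vllm_runner pytest fixture; the checkpoint name is a placeholder, not
one of the models exercised in this commit:

def test_quantized_model_smoke(vllm_runner):
    with vllm_runner(
        model_name="some-org/opt-125m-int8wo",  # hypothetical checkpoint
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        enforce_eager=True,  # skip torch.compile to cut CI time
    ) as llm:
        # Four greedy tokens are enough to verify load + generate.
        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
    assert output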


@@ -19,7 +19,7 @@ def test_pre_quantized_model(vllm_runner):
         dtype="bfloat16",
         enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -39,8 +39,9 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_location):
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location=pt_load_map_location,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -54,8 +55,9 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -69,8 +71,9 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
         quantization="torchao",
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -90,7 +93,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
         dtype="bfloat16",
         pt_load_map_location="cuda:0",
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -122,8 +125,9 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
         pt_load_map_location="cuda:0",
         quantization="torchao",
         hf_overrides=hf_overrides,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -156,8 +160,9 @@ def test_on_the_fly_quant_config_file(vllm_runner):
         pt_load_map_location="cuda:0",
         quantization="torchao",
         hf_overrides=hf_overrides,
+        enforce_eager=True,
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -228,7 +233,7 @@ def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner):
         "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
     )
     with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output
@@ -245,7 +250,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
     with vllm_runner(
         model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
     ) as llm:
-        output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
+        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
     assert output