Fix cpu offload testing for gptq/awq/ct (#15648)
Signed-off-by: mgoin <mgoin64@gmail.com>
@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq():
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test GPTQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                          ["--cpu-offload-gb", "1"],
@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
 
 @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                     reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq():
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test AWQ Marlin
     compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                          ["--cpu-offload-gb", "1"],
@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                     reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
     # Test wNa16
     compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                          ["--cpu-offload-gb", "1"],
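Design note: the same two-line fix is applied per test rather than through a module-wide fixture, so the real-weight requirement stays scoped to the quant methods that need it; test_cpu_offload_fp8 above is left running with the default load format. Each test presumably serves the model under both argument lists via compare_two_settings — once with no extra flags and once with "--cpu-offload-gb 1" — and asserts the outputs agree; with randomly initialized dummy weights the 4-bit Marlin paths can legitimately diverge between the two runs, which is the flakiness this commit removes.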