[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (#7225)

Author: Michael Goin
Date:   2024-08-06 21:34:26 -04:00
Committed by: GitHub
Parent: fd95e026e0
Commit: f9a5600649

4 changed files with 33 additions and 14 deletions


@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
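
The tests above exercise CPU offloading through the server CLI flags. For reference only (not part of this commit), a minimal sketch of hitting the same path through vLLM's Python API: cpu_offload_gb and quantization are the engine-argument counterparts of --cpu-offload-gb and --quantization, and the prompt and sampling values are illustrative.

# Sketch: run the GPTQ model with part of its weights held in CPU memory.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
    quantization="gptq",  # force plain GPTQ; omit to let gptq_marlin be auto-selected
    cpu_offload_gb=1,     # keep ~1 GiB of weights in CPU memory
)
outputs = llm.generate(["Hello, world!"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)

With offloading enabled, the held-back weights are copied to the GPU on the fly during each forward pass, which is the code path these GPTQ and GPTQ Marlin fixes target.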