diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index c859f890b..fb794baa5 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -20,7 +20,7 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") def test_pre_quantized_model(vllm_runner): with vllm_runner( - "drisspg/fp8-opt-125m", + "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.15.0", quantization="torchao", dtype="bfloat16", enforce_eager=True, @@ -52,22 +52,6 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_loca assert output -@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") -def test_opt_125m_int4wo_model_per_module_quant(vllm_runner): - torch._dynamo.reset() - model_name = "jerryzh168/opt-125m-int4wo-per-module" - with vllm_runner( - model_name=model_name, - quantization="torchao", - dtype="bfloat16", - pt_load_map_location="cuda:0", - enforce_eager=True, - ) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=4) - - assert output - - @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") def test_qwenvl_int8wo_model_loading_with_params(vllm_runner): torch._dynamo.reset()