Consolidate Intel Quantization Toolkit Integration in vLLM (#31716)
Signed-off-by: yiliu30 <yi4.liu@intel.com>
This commit is contained in:
@@ -26,9 +26,7 @@ MODELS = [
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
    """Smoke-test an AutoRound-quantized model end to end.

    Loads *model* through the ``vllm_runner`` fixture with eager execution
    (no CUDA-graph capture) and ``allow_deprecated_quantization=True`` so
    the deprecated AutoRound quantization path is still accepted, then
    checks that greedy generation produces a non-empty result.

    Args:
        vllm_runner: pytest fixture yielding a context-managed vLLM runner.
        model: model identifier, parametrized over ``MODELS``.
    """
    with vllm_runner(
        model, enforce_eager=True, allow_deprecated_quantization=True
    ) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
        # A falsy/empty output means generation silently failed.
        assert output
        print(f"{output[0][1]}")
|
||||
|
||||
Reference in New Issue
Block a user