[V1] V1 Enablement Oracle (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Author: Robert Shaw (committed by GitHub)
Date: 2025-03-15 01:02:20 -04:00
Parent: 8c0d15d5c5
Commit: d4d93db2c5
96 changed files with 1537 additions and 512 deletions


@@ -21,6 +21,14 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.platforms import current_platform
 
 
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module relies on V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.mark.parametrize(
     "model_args",
     [
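The hunk above is the module-wide form of the pattern this commit applies throughout: an autouse pytest fixture that pins VLLM_USE_V1=0 for every test in a module that depends on V0 internals. A minimal self-contained sketch of that pattern, assuming nothing beyond pytest itself (the test body is illustrative, not from the diff):

import os

import pytest


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """
    This module relies on V0 internals, so set VLLM_USE_V1=0.

    autouse=True applies the fixture to every test in the module without
    it being requested, and function scope means monkeypatch restores
    the environment after each test finishes.
    """
    monkeypatch.setenv('VLLM_USE_V1', '0')


def test_env_is_pinned():
    # The fixture has already run by the time the test body executes.
    assert os.environ['VLLM_USE_V1'] == '0'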


@@ -10,6 +10,13 @@ from tests.quantization.utils import is_quant_method_supported
 from ..utils import compare_two_settings
 
 
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    # Fall back to V0 if cpu offloading is enabled.
+    # Fixture is required so that the baseline uses V0.
+    monkeypatch.setenv('VLLM_USE_V1', '0')
+
+
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
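The comment in this hunk records a narrower motivation: the module only needs V0 because cpu offloading forces the fallback, and the baseline in compare_two_settings must run on the same engine as the offloaded run. A speculative sketch of making that condition explicit (the CPU_OFFLOAD_ENABLED flag is invented for illustration; the actual test file pins V0 unconditionally):

import os

import pytest

# Invented module-level flag, for illustration only.
CPU_OFFLOAD_ENABLED = True


@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    # Fall back to V0 when cpu offloading is enabled, so the baseline
    # and the offloaded run are compared on the same engine.
    if CPU_OFFLOAD_ENABLED:
        monkeypatch.setenv('VLLM_USE_V1', '0')


def test_baseline_engine_matches():
    assert os.environ.get('VLLM_USE_V1') == '0'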


@@ -47,7 +47,9 @@ KV_CACHE_MODELS = [
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                     reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
-def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
+def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
     with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
 
         def check_model(model):
@@ -86,6 +88,9 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
 @pytest.mark.parametrize("force_marlin", [False, True])
 def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                          monkeypatch) -> None:
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
+
     if force_marlin:
         monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")


@@ -28,8 +28,10 @@ MODEL_QUANT = [
 @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
-def test_gptq_with_dynamic(vllm_runner, model_id: str,
-                           use_marlin_kernel: bool):
+def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
+                           monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
     vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)


@@ -29,7 +29,10 @@ def test_lm_head(
     vllm_runner,
     model_id: str,
     lm_head_quantized: bool,
+    monkeypatch,
 ) -> None:
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
     with vllm_runner(model_id, dtype=torch.float16,
                      max_model_len=2048) as vllm_model:


@@ -10,7 +10,9 @@ from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
     QuarkLinearMethod, QuarkW8A8Fp8)
 
 
-def test_quark_fp8(vllm_runner):
+def test_quark_fp8(vllm_runner, monkeypatch):
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
     model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
     with vllm_runner(model_path) as llm:


@@ -101,8 +101,10 @@ def test_register_quantization_config():
     argvalues=[
         "meta-llama/Llama-3.2-1B-Instruct",
     ])
-def test_custom_quant(vllm_runner, model):
+def test_custom_quant(vllm_runner, model, monkeypatch):
     """Test infer with the custom quantization method."""
+    # vllm_runner.apply_model() relies on V0 internals.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
     with vllm_runner(model_name=model,
                      quantization="custom_quant",
                      enforce_eager=True) as llm:
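The remaining hunks all make the same move on parametrized tests: monkeypatch slots in alongside the parametrized arguments, and each generated test case gets a fresh, automatically reverted environment. A small sketch of that composition (the model ids are illustrative, not from the diff):

import os

import pytest


@pytest.mark.parametrize("model_id", ["model-a", "model-b"])
def test_parametrized_with_pin(model_id, monkeypatch):
    # Each parametrized case runs with its own monkeypatch, so the env
    # pin never leaks between cases.
    monkeypatch.setenv("VLLM_USE_V1", "0")
    assert os.environ["VLLM_USE_V1"] == "0"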