[V0 deprecation] Remove VLLM_USE_V1 usage in most modules (#27955)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by wangxiyuan on 2025-11-05 12:51:16 +08:00; committed by GitHub
parent 878fd5a16f
commit 428bc7bf1c
19 changed files with 107 additions and 238 deletions

View File

@@ -285,10 +285,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-        if not envs.VLLM_USE_V1:
-            return
-
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
@@ -329,10 +325,6 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
-        if not envs.VLLM_USE_V1:
-            return
-
         # Save the user input before it gets modified by MambaModelConfig
         mamba_block_size = vllm_config.cache_config.mamba_block_size
 
         # Enable FULL_AND_PIECEWISE by default
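
Both hunks above drop the same guard: on the old V0 engine, verify_and_update_config returned early, so the Mamba cache verification only ran under V1. With V0 removed, the body always runs. A minimal sketch of the control-flow change, with a hypothetical _verify helper standing in for the real method body (not the actual vLLM source):

class MambaModelConfigSketch:
    # Before this commit: a V0/V1 flag guarded an early return.
    @classmethod
    def verify_and_update_config_before(cls, vllm_config) -> None:
        use_v1 = True  # stand-in for envs.VLLM_USE_V1
        if not use_v1:
            return  # V0 engine: skip verification entirely
        cls._verify(vllm_config)

    # After: V0 is gone, so the guard disappears and verification is
    # unconditional.
    @classmethod
    def verify_and_update_config(cls, vllm_config) -> None:
        cls._verify(vllm_config)

    @classmethod
    def _verify(cls, vllm_config) -> None:
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        # ... cache/config checks, unchanged by this commit ...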

View File

@@ -9,7 +9,6 @@ from torch import nn
 from transformers import BatchFeature, Gemma3Config, Gemma3Processor
 from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
@@ -137,11 +136,10 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         if not do_pan_and_scan:
             return 0
 
-        if envs.VLLM_USE_V1:
-            logger.warning_once(
-                "`do_pan_and_scan=True` has suboptimal results on V1 "
-                "because of the simplified attention pattern being used."
-            )
+        logger.warning_once(
+            "`do_pan_and_scan=True` has suboptimal results on V1 "
+            "because of the simplified attention pattern being used."
+        )
 
         # Based on Gemma3ImageProcessor.pan_and_scan
         if image_width >= image_height:
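
The Gemma3 hunk is the same cleanup with a behavioral note: the pan-and-scan warning used to fire only when VLLM_USE_V1 was set, and now fires whenever do_pan_and_scan=True, since V1 is the only engine left. A rough sketch of the resulting logic, with a hand-rolled warning_once standing in for vLLM's logger helper and a simplified signature:

import logging

logger = logging.getLogger("gemma3_sketch")
_seen: set[str] = set()

def warning_once(msg: str) -> None:
    # Stand-in for vLLM's logger.warning_once: emit each message only once.
    if msg not in _seen:
        _seen.add(msg)
        logger.warning(msg)

def get_num_crops(do_pan_and_scan: bool) -> int:
    if not do_pan_and_scan:
        return 0
    # No V0/V1 branch anymore: warn unconditionally when pan-and-scan is on.
    warning_once(
        "`do_pan_and_scan=True` has suboptimal results on V1 "
        "because of the simplified attention pattern being used."
    )
    return 1  # placeholder; the real method derives the crop count from image size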

View File

@@ -12,7 +12,6 @@ from torch.func import functional_call
 from transformers import PretrainedConfig
 from typing_extensions import deprecated
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
@@ -576,11 +575,8 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
     pin_memory = is_pin_memory_available()
     uva_available = is_uva_available()
 
-    if envs.VLLM_USE_V1:
-        assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
-        uva_offloading = True
-    else:
-        uva_offloading = False
+    assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
+    uva_offloading = True
 
     # offload parameters to CPU
     # use pin_memory if possible, which helps cudagraph capture speed
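
The final hunk applies the pattern to CPU offloading: UVA-based offloading (pinned host memory directly addressable by the GPU) was previously enabled only under V1, with V0 falling back to uva_offloading = False; now UVA support is asserted unconditionally. A condensed sketch of the simplified flow, taking the availability flag as a parameter instead of calling vLLM's is_uva_available utility:

import torch

def maybe_offload_to_cpu(module: torch.nn.Module, uva_available: bool) -> torch.nn.Module:
    # No V0 fallback anymore: UVA support is a hard requirement, matching
    # the assert kept in the diff.
    assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
    uva_offloading = True  # always True now; the old else branch is gone
    # ... parameters are then offloaded to pinned CPU memory, which also
    # helps cudagraph capture speed, per the comment in the diff ...
    return module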