[V0 deprecation] Remove VLLM_USE_V1 usage in most modules (#27955)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
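Each hunk below removes the same pattern: a branch on the `VLLM_USE_V1` environment flag that either returned early or selected V0-specific behaviour. With the V0 engine deprecated, V1 is the only code path, so the guards are dropped and the V1 branch runs unconditionally. A minimal sketch of the deleted guard, written as a simplified free-function stand-in for the real `verify_and_update_config` hooks:

import vllm.envs as envs  # VLLM_USE_V1 still existed at the time of this commit
from vllm.config import VllmConfig

def verify_and_update_config(vllm_config: VllmConfig) -> None:
    # Before this commit: V1-only config updates were skipped on the V0 engine.
    if not envs.VLLM_USE_V1:
        return
    # After this commit: the guard is gone and the code below always runs.
    cache_config = vllm_config.cache_config
    ...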
@@ -285,10 +285,6 @@ class MambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
 
-        if not envs.VLLM_USE_V1:
-            return
-
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
@@ -329,10 +325,6 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
         Args:
             vllm_config: vLLM Config
         """
 
-        if not envs.VLLM_USE_V1:
-            return
-
         # Save the user input before it gets modified by MambaModelConfig
         mamba_block_size = vllm_config.cache_config.mamba_block_size
         # Enable FULL_AND_PIECEWISE by default
@@ -9,7 +9,6 @@ from torch import nn
 from transformers import BatchFeature, Gemma3Config, Gemma3Processor
 from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
@@ -137,11 +136,10 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         if not do_pan_and_scan:
             return 0
 
-        if envs.VLLM_USE_V1:
-            logger.warning_once(
-                "`do_pan_and_scan=True` has suboptimal results on V1 "
-                "because of the simplified attention pattern being used."
-            )
+        logger.warning_once(
+            "`do_pan_and_scan=True` has suboptimal results on V1 "
+            "because of the simplified attention pattern being used."
+        )
 
         # Based on Gemma3ImageProcessor.pan_and_scan
         if image_width >= image_height:
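With the `if envs.VLLM_USE_V1:` wrapper removed, the pan-and-scan notice is emitted whenever `do_pan_and_scan=True`; `logger.warning_once` keeps it from repeating on every call. A rough stand-in for that warn-once behaviour, making no assumption about vLLM's actual logger internals:

import logging
from functools import cache

logger = logging.getLogger(__name__)

@cache  # each distinct message is logged at most once per process
def warning_once(msg: str) -> None:
    logger.warning(msg)

msg = ("`do_pan_and_scan=True` has suboptimal results on V1 "
       "because of the simplified attention pattern being used.")
warning_once(msg)
warning_once(msg)  # suppressed: the cached call does not log again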
@@ -12,7 +12,6 @@ from torch.func import functional_call
 from transformers import PretrainedConfig
 from typing_extensions import deprecated
 
-import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
@@ -576,11 +575,8 @@ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
     pin_memory = is_pin_memory_available()
     uva_available = is_uva_available()
 
-    if envs.VLLM_USE_V1:
-        assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
-        uva_offloading = True
-    else:
-        uva_offloading = False
+    assert uva_available, "V1 CPU offloading requires uva (pin memory) support"
+    uva_offloading = True
 
     # offload parameters to CPU
     # use pin_memory if possible, which helps cudagraph capture speed
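The final hunk drops the V0 fallback (`uva_offloading = False`) entirely: CPU offloading now always requires UVA support and always works with pinned host memory. A hypothetical sketch of pinned-memory offloading; the helper name `offload_params_to_pinned_cpu` is invented for illustration, and vLLM's real `maybe_offload_to_cpu` does considerably more:

import torch
from torch import nn

def offload_params_to_pinned_cpu(module: nn.Module) -> nn.Module:
    # Copy every parameter into page-locked host memory (illustration only).
    for param in module.parameters():
        # Pinned memory is reachable from the GPU via unified virtual addressing
        # and speeds up host-to-device copies, which helps cudagraph capture.
        # Note: .pin_memory() needs a CUDA-capable PyTorch build.
        param.data = param.data.to("cpu", copy=True).pin_memory()
    return module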