[V0 deprecation] Deprecate V0 Neuron backend (#21159)

commit 4172235ab7
parent 848562bd49
Author: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date:   2025-09-06 16:15:18 -07:00
Committed by: GitHub
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

46 changed files with 10 additions and 5462 deletions


@@ -461,11 +461,6 @@ class ModelConfig:
     DP (which is controlled by `--data-parallel-size`).
     This is only supported on a per-model basis and falls back to
     `"weights"` if the encoder does not support DP."""
-    override_neuron_config: dict[str, Any] = field(default_factory=dict)
-    """Initialize non-default neuron config or override default neuron config
-    that are specific to Neuron devices, this argument will be used to
-    configure the neuron config that can not be gathered from the vllm
-    arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
@@ -785,10 +780,6 @@ class ModelConfig:
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()

-        if (not current_platform.is_neuron() and self.override_neuron_config):
-            raise ValueError(
-                "`override_neuron_config` is only supported on Neuron.")

         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False
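
The deleted guard was a plain platform check. Restated as a standalone sketch, with the platform probe reduced to a boolean so it runs without vLLM installed:

def verify_neuron_override(override_neuron_config: dict,
                           is_neuron: bool) -> None:
    # V0 behavior: Neuron-only options were rejected on all other platforms.
    if override_neuron_config and not is_neuron:
        raise ValueError(
            "`override_neuron_config` is only supported on Neuron.")

verify_neuron_override({"cast_logits_dtype": "bfloat16"}, is_neuron=False)  # raises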
@@ -1696,13 +1687,7 @@ class ModelConfig:
"""
For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
True to enable cross-attention
Neuron needs all multimodal data to be in the decoder and does not
need to explicitly enable cross-attention
"""
if (current_platform.is_neuron()
and self.hf_config.model_type == "mllama"):
return False
return is_encoder_decoder(self.hf_config)
@property
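
With the special case dropped, the property simply mirrors the (possibly vLLM-patched) HF config, so mllama reports True on every platform. A self-contained re-implementation sketch; the real helper lives elsewhere in vLLM, and this version assumes only an HF-style config object:

def is_encoder_decoder(hf_config) -> bool:
    # HF text configs expose is_encoder_decoder as a plain attribute,
    # defaulting to False when absent.
    return getattr(hf_config, "is_encoder_decoder", False)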
@@ -1871,7 +1856,7 @@ class LoadConfig:
             self.ignore_patterns = ["original/**/*"]

-Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
+Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]

 @config
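
Because `Device` is a `typing.Literal`, dropping "neuron" narrows both static type checking and any runtime validation derived from the alias. A sketch of such a runtime check (not necessarily how vLLM itself validates the flag):

from typing import Literal, get_args

Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]

def validate_device(name: str) -> None:
    # "neuron" is no longer a member of the Literal, so it fails here.
    if name not in get_args(Device):
        raise ValueError(f"unsupported device: {name!r}")

validate_device("neuron")  # ValueError after this commit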
@@ -1927,9 +1912,7 @@ class DeviceConfig:
             self.device_type = self.device.type

         # Some device types require processing inputs on CPU
-        if self.device_type in ["neuron"]:
-            self.device = torch.device("cpu")
-        elif self.device_type in ["tpu"]:
+        if self.device_type in ["tpu"]:
             self.device = None
         else:
             # Set device with device type
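
The surviving branch logic is short enough to restate standalone. A condensed sketch with `torch` imported explicitly (in vLLM this sits inside `DeviceConfig.__post_init__`); note that "neuron" previously resolved to a CPU torch device and now falls through like any other type:

import torch

def resolve_device(device_type: str):
    # TPU defers device construction to the XLA runtime; everything else
    # maps directly onto a torch.device of the same name.
    if device_type in ["tpu"]:
        return None
    return torch.device(device_type)

assert resolve_device("cpu") == torch.device("cpu")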
@@ -3941,7 +3924,6 @@ class VllmConfig:
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "


@@ -33,9 +33,8 @@ class CacheConfig:
"""Configuration for the KV cache."""
block_size: SkipValidation[BlockSize] = None # type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.
This config has no static default. If left unspecified by the user, it will
be set in `Platform.check_and_update_config()` based on the current
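
For intuition about what `block_size` controls, the arithmetic is simple: the KV cache is paged, and a sequence occupies ceil(seq_len / block_size) blocks. A back-of-envelope sketch with illustrative numbers:

# Illustrative numbers only; real values come from the model and platform.
max_model_len = 4096
block_size = 16                # tokens per KV-cache block (CUDA caps it at 32)
blocks_per_full_seq = -(-max_model_len // block_size)  # ceil division -> 256
print(blocks_per_full_seq)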


@@ -377,10 +377,7 @@ class ParallelConfig:
         from vllm.executor import ray_utils
         backend: DistributedExecutorBackend = "mp"
         ray_found = ray_utils.ray_is_available()

-        if current_platform.is_neuron():
-            # neuron uses single process to control multiple devices
-            backend = "uni"
-        elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
+        if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
             backend = "uni"
         elif (current_platform.is_cuda()
               and cuda_device_count_stateless() < self.world_size):
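
With the Neuron branch gone, the executor-backend choice reduces to the TPU/SPMD case plus the CUDA world-size check. A standalone sketch of the selection order, with the platform probes stubbed as booleans (the real method continues past this excerpt, e.g. with the Ray fallback suggested by `ray_found` above):

def pick_backend(is_tpu: bool, xla_use_spmd: bool, is_cuda: bool,
                 visible_gpus: int, world_size: int) -> str:
    backend = "mp"  # default: multiprocessing, one worker process per device
    if is_tpu and xla_use_spmd:
        backend = "uni"  # SPMD drives every device from a single process
    elif is_cuda and visible_gpus < world_size:
        # Fewer local GPUs than the requested world size; the real code
        # switches to a distributed launcher here (elided in the diff).
        ...
    return backend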