[platforms] enable platform plugins (#11602)
Signed-off-by: youkaichao <youkaichao@gmail.com>
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,7 +22,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform, interface
+from vllm.platforms import CpuArchEnum
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
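
The header change is the heart of the patch: the eager `from vllm.platforms import current_platform, interface` is dropped, and only `CpuArchEnum`, a plain enum with no detection logic, stays imported at module scope. With platform plugins, the current platform can only be known after plugins are loaded, so every use site below re-imports it lazily. A minimal sketch of how a module can defer that resolution with PEP 562's module-level `__getattr__` (illustrative only; the names and detection order are assumptions, not vllm's actual internals):

    # platforms/__init__.py -- illustrative sketch, not vllm's real code
    class Platform:
        device_type = "cpu"

    _current_platform = None

    def _resolve_platform() -> Platform:
        # Hypothetical: consult plugin-registered platforms first,
        # then fall back to built-in detection.
        return Platform()

    def __getattr__(name: str):
        # PEP 562: called for attributes not found in the module, so
        # `from vllm.platforms import current_platform` inside a function
        # resolves the platform on first use, not at import time.
        global _current_platform
        if name == "current_platform":
            if _current_platform is None:
                _current_platform = _resolve_platform()
            return _current_platform
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
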
@@ -349,6 +349,7 @@ class ModelConfig:
         self.is_hybrid = self._init_is_hybrid()
         self.has_inner_state = self._init_has_inner_state()
 
+        from vllm.platforms import current_platform
         if current_platform.is_neuron():
             self.override_neuron_config = override_neuron_config
         else:
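
This hunk sets the pattern repeated through the rest of the file: the import moves inside the method so the platform is bound at call time, after plugin loading, rather than frozen when config.py is imported. A self-contained demo of the difference (the module name `plat` is a stand-in for vllm.platforms):

    import sys
    import types

    # Fake module standing in for vllm.platforms.
    plat = types.ModuleType("plat")
    plat.current_platform = "cpu"
    sys.modules["plat"] = plat

    from plat import current_platform  # eager: binds "cpu" once, forever

    def lazy_platform():
        from plat import current_platform  # lazy: re-reads the attribute
        return current_platform

    plat.current_platform = "plugin_npu"  # a plugin swaps the platform in

    print(current_platform)  # -> cpu         (stale, bound at import)
    print(lazy_platform())   # -> plugin_npu  (fresh, bound at call)
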
@@ -589,6 +590,7 @@ class ModelConfig:
                 raise ValueError(
                     f"Unknown quantization method: {self.quantization}. Must "
                     f"be one of {supported_quantization}.")
+            from vllm.platforms import current_platform
             current_platform.verify_quantization(self.quantization)
             if self.quantization not in optimized_quantization_methods:
                 logger.warning(
@@ -644,6 +646,7 @@ class ModelConfig:
 
         # Reminder: Please update docs/source/usage/compatibility_matrix.md
         # If the feature combo become valid
+        from vllm.platforms import current_platform
         if not current_platform.is_async_output_supported(self.enforce_eager):
             logger.warning(
                 "Async output processing is not supported on the "
@@ -1012,6 +1015,7 @@ class CacheConfig:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
+        from vllm.platforms import current_platform
         if (current_platform.is_cuda() and self.block_size is not None
                 and self.block_size > 32):
             raise ValueError("CUDA Paged Attention kernel only supports "
@@ -1279,6 +1283,7 @@ class ParallelConfig:
                     f"distributed executor backend "
                     f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu", "hpu"]
+        from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
             if self.distributed_executor_backend is None:
@@ -1327,7 +1332,7 @@
     def _verify_args(self) -> None:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
-
+        from vllm.platforms import current_platform
         if self.distributed_executor_backend not in (
                 "ray", "mp", None) and not (isinstance(
                     self.distributed_executor_backend, type) and issubclass(
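
The `_verify_args` hunk rides on a convention the file already uses: `ExecutorBase` is imported lazily under the "Lazy import to avoid circular import" comment, and the platform import takes the removed blank line's place beside it. A toy illustration of the kind of cycle a function-level import breaks (module names hypothetical):

    # config_mod.py
    def verify(backend_cls) -> bool:
        # Importing here instead of at the top means config_mod and
        # executor_mod never need each other during module initialization.
        from executor_mod import ExecutorBase
        return issubclass(backend_cls, ExecutorBase)

    # executor_mod.py
    import config_mod  # safe: config_mod has no top-level executor import

    class ExecutorBase:
        pass
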
@@ -1528,6 +1533,7 @@ class DeviceConfig:
     def __init__(self, device: str = "auto") -> None:
         if device == "auto":
             # Automated device type detection
+            from vllm.platforms import current_platform
             self.device_type = current_platform.device_type
             if not self.device_type:
                 raise RuntimeError("Failed to infer device type")
@@ -2241,9 +2247,10 @@ def _get_and_verify_dtype(
         else:
             torch_dtype = config_dtype
 
+            from vllm.platforms import current_platform
             if (current_platform.is_cpu()
                     and current_platform.get_cpu_architecture()
-                    == interface.CpuArchEnum.POWERPC
+                    == CpuArchEnum.POWERPC
                     and (config_dtype == torch.float16
                          or config_dtype == torch.float32)):
                 logger.info(
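
Beyond the lazy import, this hunk swaps `interface.CpuArchEnum.POWERPC` for the directly imported `CpuArchEnum.POWERPC`, matching the new header import: an enum carries no platform-detection state, so it is safe to import eagerly. A small sketch of the check (POWERPC comes from the diff; the other members and the dtype fallback are assumptions based on the surrounding logger.info):

    from enum import Enum

    class CpuArchEnum(Enum):
        X86 = "x86"          # assumed member
        ARM = "arm"          # assumed member
        POWERPC = "powerpc"  # appears in the diff

    def wants_dtype_fallback(arch: CpuArchEnum, dtype: str) -> bool:
        # float16/float32 on POWERPC CPUs is steered to a supported dtype.
        return arch == CpuArchEnum.POWERPC and dtype in ("float16", "float32")

    print(wants_dtype_fallback(CpuArchEnum.POWERPC, "float16"))  # True
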
@@ -3083,6 +3090,7 @@ class VllmConfig:
             model_config: ModelConfig,
             load_config: LoadConfig) -> Optional[QuantizationConfig]:
         """Get the quantization config."""
+        from vllm.platforms import current_platform
         if model_config.quantization is not None:
             from vllm.model_executor.model_loader.weight_utils import (
                 get_quant_config)
@@ -3145,6 +3153,7 @@
         self.quant_config = VllmConfig._get_quantization_config(
             self.model_config, self.load_config)
 
+        from vllm.platforms import current_platform
         if self.scheduler_config is not None and \
             self.model_config is not None and \
             self.scheduler_config.chunked_prefill_enabled and \
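
Taken together, the hunks make every consumer of `current_platform` late-binding, which is what allows an out-of-tree platform plugin to take effect. On the discovery side, vllm's plugin documentation describes entry points in the `vllm.platform_plugins` group, where each plugin returns the qualified name of its Platform class; a hedged sketch of such a loader (the loop body is illustrative, not vllm's actual code):

    from importlib.metadata import entry_points

    def find_plugin_platform():
        # Each entry point is a callable returning the fully qualified
        # class name of a Platform subclass, or None if the plugin
        # decides it does not apply on this machine.
        for ep in entry_points(group="vllm.platform_plugins"):
            plugin_func = ep.load()
            qualname = plugin_func()
            if qualname is not None:
                return qualname
        return None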