[Hardware][PPC64LE] Enable V1 for ppc64le and ARM (#20554)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com> Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com> Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com>
This commit is contained in:
@@ -36,6 +36,7 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.plugins import load_general_plugins
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
|
||||
@@ -1096,7 +1097,6 @@ class EngineArgs:
|
||||
If VLLM_USE_V1 is specified by the user but the VllmConfig
|
||||
is incompatible, we raise an error.
|
||||
"""
|
||||
from vllm.platforms import current_platform
|
||||
current_platform.pre_register_and_update()
|
||||
|
||||
device_config = DeviceConfig(
|
||||
@@ -1123,9 +1123,16 @@ class EngineArgs:
|
||||
# Set default arguments for V0 or V1 Engine.
|
||||
if use_v1:
|
||||
self._set_default_args_v1(usage_context, model_config)
|
||||
# Disable chunked prefill for POWER (ppc64le)/ARM CPUs in V1
|
||||
if current_platform.is_cpu(
|
||||
) and current_platform.get_cpu_architecture() in (
|
||||
CpuArchEnum.POWERPC, CpuArchEnum.ARM):
|
||||
logger.info(
|
||||
"Chunked prefill is not supported for ARM and POWER CPUs; "
|
||||
"disabling it for V1 backend.")
|
||||
self.enable_chunked_prefill = False
|
||||
else:
|
||||
self._set_default_args_v0(model_config)
|
||||
|
||||
assert self.enable_chunked_prefill is not None
|
||||
|
||||
if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
|
||||
@@ -1242,7 +1249,6 @@ class EngineArgs:
|
||||
if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
|
||||
raise ValueError("Multi-Step Chunked-Prefill is not supported "
|
||||
"for pipeline-parallel-size > 1")
|
||||
from vllm.platforms import current_platform
|
||||
if current_platform.is_cpu():
|
||||
logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
|
||||
"currently not supported for CPUs and has been "
|
||||
@@ -1391,7 +1397,6 @@ class EngineArgs:
|
||||
# Skip this check if we are running on a non-GPU platform,
|
||||
# or if the device capability is not available
|
||||
# (e.g. in a Ray actor without GPUs).
|
||||
from vllm.platforms import current_platform
|
||||
if (current_platform.is_cuda()
|
||||
and current_platform.get_device_capability()
|
||||
and current_platform.get_device_capability().major < 8):
|
||||
@@ -1652,7 +1657,6 @@ class EngineArgs:
|
||||
# as the platform that vLLM is running on (e.g. the case of scaling
|
||||
# vLLM with Ray) and has no GPUs. In this case we use the default
|
||||
# values for non-H100/H200 GPUs.
|
||||
from vllm.platforms import current_platform
|
||||
try:
|
||||
device_memory = current_platform.get_device_total_memory()
|
||||
device_name = current_platform.get_device_name().lower()
|
||||
@@ -1755,7 +1759,6 @@ class AsyncEngineArgs(EngineArgs):
|
||||
parser.add_argument('--disable-log-requests',
|
||||
action='store_true',
|
||||
help='Disable logging requests.')
|
||||
from vllm.platforms import current_platform
|
||||
current_platform.pre_register_and_update(parser)
|
||||
return parser
|
||||
|
||||
|
||||
Reference in New Issue
Block a user