[Platform] Move platform check to the right place (#18470)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -42,7 +42,10 @@ from vllm.transformers_utils.config import (
|
||||
try_get_generation_config, uses_mrope)
|
||||
from vllm.transformers_utils.s3_utils import S3Model
|
||||
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
|
||||
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
|
||||
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
||||
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
|
||||
LayerBlockType, cuda_device_count_stateless,
|
||||
get_cpu_memory, get_open_port, is_torch_equal_or_newer,
|
||||
random_uuid, resolve_obj_by_qualname)
|
||||
|
||||
@@ -64,12 +67,6 @@ logger = init_logger(__name__)
|
||||
|
||||
ConfigT = TypeVar("ConfigT", bound=ConfigType)
|
||||
|
||||
# This value is chosen to have a balance between ITL and TTFT. Note it is
|
||||
# not optimized for throughput.
|
||||
_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
|
||||
_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
|
||||
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
|
||||
|
||||
TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
|
||||
"score", "reward", "transcription"]
|
||||
|
||||
@@ -2074,28 +2071,28 @@ class SchedulerConfig:
|
||||
# so we don't reject sequences on account of a short
|
||||
# max_num_batched_tokens.
|
||||
self.max_num_batched_tokens = max(
|
||||
self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
else:
|
||||
self.max_num_batched_tokens = (
|
||||
_DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
else:
|
||||
# If max_model_len is too short, use
|
||||
# _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
|
||||
# DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
|
||||
# for higher throughput.
|
||||
self.max_num_batched_tokens = max(
|
||||
self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
if self.runner_type == "pooling":
|
||||
# Choose specific value for higher throughput
|
||||
self.max_num_batched_tokens = max(
|
||||
self.max_num_batched_tokens,
|
||||
_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
)
|
||||
if self.is_multimodal_model:
|
||||
# The value needs to be at least the number of multimodal tokens
|
||||
self.max_num_batched_tokens = max(
|
||||
self.max_num_batched_tokens,
|
||||
_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||
)
|
||||
|
||||
# When using default settings,
|
||||
@@ -4316,18 +4313,6 @@ class VllmConfig:
|
||||
"full_cuda_graph is not supported with "
|
||||
"cascade attention. Disabling cascade attention.")
|
||||
self.model_config.disable_cascade_attn = True
|
||||
|
||||
if self.model_config and self.model_config.use_mla and \
|
||||
not (current_platform.is_cuda() or current_platform.is_rocm()):
|
||||
logger.info(
|
||||
"MLA is enabled on a non-GPU platform; forcing chunked "
|
||||
"prefill and prefix caching to be disabled.")
|
||||
self.scheduler_config.enable_chunked_prefill = False
|
||||
self.scheduler_config.chunked_prefill_enabled = False
|
||||
self.scheduler_config.max_num_batched_tokens = max(
|
||||
self.scheduler_config.max_model_len,
|
||||
_DEFAULT_MAX_NUM_BATCHED_TOKENS)
|
||||
|
||||
if self.cache_config is not None:
|
||||
self.cache_config.enable_prefix_caching = False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user