[v1] EngineArgs for better config handling for v1 (#10382)

Signed-off-by: rickyx <rickyx@anyscale.com>
This commit is contained in:
Ricky Xu
2024-11-25 21:09:43 -08:00
committed by GitHub
parent a6760f6456
commit 519e8e4182
13 changed files with 109 additions and 27 deletions

View File

@@ -41,19 +41,6 @@ class EngineCore:
executor_class: Type[GPUExecutor],
usage_context: UsageContext,
):
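# Override the configs for V1.
# FIXME: these per-usage-context values are hard-coded here; presumably they belong in engine-argument/config handling rather than in EngineCore.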
if usage_context == UsageContext.LLM_CLASS:
vllm_config.scheduler_config.max_num_seqs = 1024
vllm_config.scheduler_config.max_num_batched_tokens = 8192
elif usage_context == UsageContext.OPENAI_API_SERVER:
vllm_config.scheduler_config.max_num_seqs = 1024
vllm_config.scheduler_config.max_num_batched_tokens = 2048
# TODO (ywang96): Enable automatic prefix caching (APC) by default for multimodal models (VLMs) once they support it.
if not vllm_config.model_config.is_multimodal_model:
vllm_config.cache_config.enable_prefix_caching = True
assert vllm_config.model_config.task != "embedding"
logger.info("Initializing an LLM engine (v%s) with config: %s",