[Bugfix] Fix SHM cache initialization (#26427)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
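What changed, in brief: init_app_state no longer takes a VllmConfig that run_server_worker fetched over RPC via get_vllm_config(); it now reads engine_client.vllm_config directly, and the OpenAIServing* constructors drop their explicit model_config/vllm_config argument. A minimal sketch of the resulting access pattern follows; the EngineClient stub here is illustrative only, not vLLM's real class:

    from argparse import Namespace

    class EngineClient:
        """Illustrative stub: the client already holds the engine's config,
        so callers do not need to fetch a copy over RPC."""

        def __init__(self, vllm_config) -> None:
            self.vllm_config = vllm_config

    async def init_app_state(engine_client: EngineClient, state, args: Namespace) -> None:
        # New pattern from this commit: use the client's own config object.
        vllm_config = engine_client.vllm_config
        state.vllm_config = vllm_config

The serving handlers can then derive the model config from that same object, which is presumably why each constructor call in the diff below loses its config argument.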
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1601,10 +1601,11 @@ def build_app(args: Namespace) -> FastAPI:
 
 async def init_app_state(
     engine_client: EngineClient,
-    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
+    vllm_config = engine_client.vllm_config
+
     if args.served_model_name is not None:
         served_model_names = args.served_model_name
     else:
@@ -1622,11 +1623,9 @@ async def init_app_state
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
     state.vllm_config = vllm_config
-    model_config = vllm_config.model_config
 
     supported_tasks = await engine_client.get_supported_tasks()
-
-    logger.info("Supported_tasks: %s", supported_tasks)
+    logger.info("Supported tasks: %s", supported_tasks)
 
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
@@ -1688,7 +1687,6 @@ async def init_app_state
 
     state.openai_serving_models = OpenAIServingModels(
         engine_client=engine_client,
-        model_config=model_config,
         base_model_paths=base_model_paths,
         lora_modules=lora_modules,
     )
@@ -1696,7 +1694,6 @@ async def init_app_state
     state.openai_serving_responses = (
         OpenAIServingResponses(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             chat_template=resolved_chat_template,
@@ -1717,7 +1714,6 @@ async def init_app_state
     state.openai_serving_chat = (
         OpenAIServingChat(
             engine_client,
-            model_config,
             state.openai_serving_models,
             args.response_role,
             request_logger=request_logger,
@@ -1740,7 +1736,6 @@ async def init_app_state
     state.openai_serving_completion = (
         OpenAIServingCompletion(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             return_tokens_as_token_ids=args.return_tokens_as_token_ids,
@@ -1754,7 +1749,6 @@ async def init_app_state
     state.openai_serving_pooling = (
         OpenAIServingPooling(
             engine_client,
-            vllm_config,
             state.openai_serving_models,
             request_logger=request_logger,
             chat_template=resolved_chat_template,
@@ -1768,7 +1762,6 @@ async def init_app_state
     state.openai_serving_embedding = (
         OpenAIServingEmbedding(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             chat_template=resolved_chat_template,
@@ -1782,7 +1775,6 @@ async def init_app_state
     state.openai_serving_classification = (
         ServingClassification(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             log_error_stack=args.log_error_stack,
@@ -1793,7 +1785,6 @@ async def init_app_state
     state.openai_serving_scores = (
         ServingScores(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             log_error_stack=args.log_error_stack,
@@ -1803,7 +1794,6 @@ async def init_app_state
     )
     state.openai_serving_tokenization = OpenAIServingTokenization(
         engine_client,
-        model_config,
         state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
@@ -1814,7 +1804,6 @@ async def init_app_state
     state.openai_serving_transcription = (
         OpenAIServingTranscription(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             log_error_stack=args.log_error_stack,
@@ -1825,7 +1814,6 @@ async def init_app_state
     state.openai_serving_translation = (
        OpenAIServingTranslation(
             engine_client,
-            model_config,
             state.openai_serving_models,
             request_logger=request_logger,
             log_error_stack=args.log_error_stack,
@@ -1946,12 +1934,11 @@ async def run_server_worker(
     maybe_register_tokenizer_info_endpoint(args)
     app = build_app(args)
 
-    vllm_config = await engine_client.get_vllm_config()
-    await init_app_state(engine_client, vllm_config, app.state, args)
+    await init_app_state(engine_client, app.state, args)
 
     logger.info(
         "Starting vLLM API server %d on %s",
-        vllm_config.parallel_config._api_process_rank,
+        engine_client.vllm_config.parallel_config._api_process_rank,
         listen_address,
     )
     shutdown_task = await serve_http(
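One plausible reading of the bug, going only by the commit title and the lines removed above: fetching the config through get_vllm_config() can yield a deserialized copy, so SHM cache state initialized against that copy diverges from the config object the engine client actually holds. A toy illustration of copy-versus-reference semantics; FakeEngineClient and demo are made-up names for this sketch, not vLLM APIs:

    import asyncio
    import copy

    class FakeEngineClient:
        def __init__(self, vllm_config) -> None:
            self.vllm_config = vllm_config

        async def get_vllm_config(self):
            # Simulates an IPC round-trip: the caller receives a fresh copy.
            return copy.deepcopy(self.vllm_config)

    async def demo() -> None:
        cfg = {"parallel_config": {"_api_process_rank": 0}}
        client = FakeEngineClient(cfg)
        rpc_copy = await client.get_vllm_config()
        assert rpc_copy == client.vllm_config      # equal in value...
        assert rpc_copy is not client.vllm_config  # ...but a distinct object

    asyncio.run(demo())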