[Misc] IO Processor plugins for pooling models (#22820)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
Author: Christian Pinto
Date: 2025-09-01 07:07:12 +01:00 (committed by GitHub)
Parent: 437c3ce026
Commit: 1cb39dbcdd
25 changed files with 1183 additions and 43 deletions
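
The diff below shows the server-side wiring for IO Processor plugin responses on the OpenAI-compatible pooling endpoint. Conceptually, such a plugin converts a plugin-specific input payload into prompts for a pooling model and turns the pooled output back into a domain-specific response. The sketch that follows only illustrates that idea; the class and method names (MyIOProcessor, pre_process, post_process) are assumptions for illustration and are not taken from the diff shown here.

from dataclasses import dataclass
from typing import Any


@dataclass
class PluginOutput:
    # Domain-specific result built from the pooled model output.
    data: Any


class MyIOProcessor:
    # Hypothetical plugin; class and method names are illustrative only.

    def pre_process(self, request_data: Any) -> list[str]:
        # Convert the incoming payload into prompts the pooling model accepts.
        return [str(request_data)]

    def post_process(self, pooled: list[list[float]]) -> PluginOutput:
        # Convert raw pooled vectors into the plugin's own response format.
        return PluginOutput(data=pooled)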


@@ -64,6 +64,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               EmbeddingRequest,
                                               EmbeddingResponse, ErrorInfo,
                                               ErrorResponse,
+                                              IOProcessorResponse,
                                               LoadLoRAAdapterRequest,
                                               PoolingRequest, PoolingResponse,
                                               RerankRequest, RerankResponse,
@@ -795,7 +796,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(),
                             status_code=generator.error.code)
-    elif isinstance(generator, PoolingResponse):
+    elif isinstance(generator, (PoolingResponse, IOProcessorResponse)):
         return JSONResponse(content=generator.model_dump())
     assert_never(generator)
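
With this change, a plugin-produced IOProcessorResponse is serialized exactly like a PoolingResponse: the handler calls model_dump() and returns the result as a JSON body. A minimal client-side sketch under that assumption (the URL and payload fields are illustrative, not taken from this diff):

import json
import urllib.request

# Assumes a locally running vLLM OpenAI-compatible server; the payload shape
# depends on the served model and on whether an IO Processor plugin is active.
payload = {"model": "my-pooling-model", "input": "some text"}
req = urllib.request.Request(
    "http://localhost:8000/pooling",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # Either a PoolingResponse or an IOProcessorResponse, already serialized
    # by the handler via model_dump().
    print(json.load(resp))
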
@@ -1782,7 +1783,7 @@ async def init_app_state(
     ) if "generate" in supported_tasks else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
-        model_config,
+        vllm_config,
         state.openai_serving_models,
         request_logger=request_logger,
         chat_template=resolved_chat_template,
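
The last hunk hands OpenAIServingPooling the full vllm_config instead of only the model_config, so the pooling serving layer can see the rest of the engine configuration (presumably including the IO processor plugin selection) while model-level settings stay reachable through the config object. A rough sketch of that pattern, reduced to the config handling and not the actual class body:

class OpenAIServingPooling:
    # Sketch only: the real constructor takes more arguments; this shows the
    # config-handling pattern implied by the diff.
    def __init__(self, engine_client, vllm_config, serving_models, **kwargs):
        self.engine_client = engine_client
        self.vllm_config = vllm_config
        # Model-level settings remain available through the full config.
        self.model_config = vllm_config.model_config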