From b386bb3d7c871f380b96d0ec0f74c53ed4cadf62 Mon Sep 17 00:00:00 2001 From: Augusto Yao Date: Wed, 11 Mar 2026 11:16:34 +0800 Subject: [PATCH] fix bugs when token_classify & classify run concurrently (#36614) Signed-off-by: augusto.yjh --- vllm/model_executor/layers/pooler/tokwise/methods.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/pooler/tokwise/methods.py b/vllm/model_executor/layers/pooler/tokwise/methods.py index baa9d4075..f242d215d 100644 --- a/vllm/model_executor/layers/pooler/tokwise/methods.py +++ b/vllm/model_executor/layers/pooler/tokwise/methods.py @@ -47,10 +47,13 @@ class AllPool(TokenPoolingMethod): pooling_metadata: PoolingMetadata, ) -> list[TokenPoolingMethodOutputItem]: pooling_cursor = pooling_metadata.get_pooling_cursor() - hidden_states_all = hidden_states.split( - pooling_cursor.num_scheduled_tokens_cpu.tolist() - ) - hidden_states_lst = [hidden_states_all[i] for i in pooling_cursor.index] + hidden_states_lst = [ + hidden_states[first : last + 1] + for first, last in zip( + pooling_cursor.first_token_indices_gpu.tolist(), + pooling_cursor.last_token_indices_gpu.tolist(), + ) + ] if not self.enable_chunked_prefill: return hidden_states_lst