diff --git a/vllm/model_executor/layers/pooler/tokwise/methods.py b/vllm/model_executor/layers/pooler/tokwise/methods.py index baa9d4075..f242d215d 100644 --- a/vllm/model_executor/layers/pooler/tokwise/methods.py +++ b/vllm/model_executor/layers/pooler/tokwise/methods.py @@ -47,10 +47,13 @@ class AllPool(TokenPoolingMethod): pooling_metadata: PoolingMetadata, ) -> list[TokenPoolingMethodOutputItem]: pooling_cursor = pooling_metadata.get_pooling_cursor() - hidden_states_all = hidden_states.split( - pooling_cursor.num_scheduled_tokens_cpu.tolist() - ) - hidden_states_lst = [hidden_states_all[i] for i in pooling_cursor.index] + hidden_states_lst = [ + hidden_states[first : last + 1] + for first, last in zip( + pooling_cursor.first_token_indices_gpu.tolist(), + pooling_cursor.last_token_indices_gpu.tolist(), + ) + ] if not self.enable_chunked_prefill: return hidden_states_lst