[Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)
Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
@@ -22,8 +22,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer
 from vllm.model_executor.layers.mamba.mamba_utils import (
     MambaStateShapeCalculator)
-from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler,
-                                               PoolingType)
+from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
@@ -604,6 +603,5 @@ class JambaForSequenceClassification(JambaForCausalLM):
             Pooler.for_classify(
                 pooler_config,
                 classifier=self.score,
-                default_pooling_type=PoolingType.LAST,
             ),
         })
||||
Reference in New Issue
Block a user