[Frontend] Support using chat template as custom score template for reranking models (#30550)

Signed-off-by: Jakub Zakrzewski <jzakrzewski@nvidia.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
Jakub Zakrzewski
2025-12-23 12:19:16 +01:00
committed by GitHub
parent 27c6c2f98c
commit 23daef548d
19 changed files with 663 additions and 46 deletions

View File

@@ -57,7 +57,14 @@ from vllm.model_executor.model_loader.weight_utils import (
)
from vllm.sequence import IntermediateTensors
from .interfaces import SupportsEagle, SupportsEagle3, SupportsLoRA, SupportsPP
from .adapters import as_embedding_model, as_seq_cls_model
from .interfaces import (
SupportsEagle,
SupportsEagle3,
SupportsLoRA,
SupportsPP,
)
from .interfaces_base import attn_type
from .utils import (
AutoWeightsLoader,
PPMissingLayer,
@@ -698,3 +705,17 @@ class LlamaForCausalLM(
name = name.replace(item, mapping[item])
return name, loaded_weight
@attn_type("encoder_only")
class LlamaBidirectionalForSequenceClassification(as_seq_cls_model(LlamaForCausalLM)):
    """Llama variant for sequence classification (e.g. reranking/scoring)
    that runs with bidirectional, encoder-only attention.

    The body is intentionally empty: the attention type is fixed by the
    ``@attn_type("encoder_only")`` decorator, the classification head comes
    from ``as_seq_cls_model(LlamaForCausalLM)``, and the remaining settings
    (pooling type, etc.) are supplied through ``LlamaBidirectionalConfig``
    — presumably at model-config load time; verify against that config class.
    """

    pass
@attn_type("encoder_only")
class LlamaBidirectionalModel(as_embedding_model(LlamaForCausalLM)):
    """Llama variant for embedding extraction that runs with bidirectional,
    encoder-only attention.

    The body is intentionally empty: the attention type is fixed by the
    ``@attn_type("encoder_only")`` decorator, the embedding/pooling wrapper
    comes from ``as_embedding_model(LlamaForCausalLM)``, and the remaining
    settings (pooling type, etc.) are supplied through
    ``LlamaBidirectionalConfig`` — presumably at model-config load time;
    verify against that config class.
    """

    pass