[Refactor] Separate sequence and token pooling types (#32026)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-10 12:53:24 +08:00
committed by GitHub
parent 52d428295d
commit 583a90e005
42 changed files with 324 additions and 204 deletions

View File

@@ -129,7 +129,7 @@ class SiglipProcessingInfo(BaseProcessingInfo):
image_width=image_width,
image_height=image_height,
),
-_get_vision_feature_select_strategy(pooler_config.pooling_type),
+_get_vision_feature_select_strategy(pooler_config.seq_pooling_type),
)
def get_image_size_with_most_features(self) -> ImageSize:
@@ -998,7 +998,7 @@ class SiglipTextEmbeddings(nn.Module):
# Assume EOS token corresponds to CLS token in text model
-@default_pooling_type("CLS")
+@default_pooling_type(seq_pooling_type="CLS")
@MULTIMODAL_REGISTRY.register_processor(
SiglipMultiModalProcessor,
info=SiglipProcessingInfo,
@@ -1125,7 +1125,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
) -> torch.Tensor:
if feature_select_strategy is None:
feature_select_strategy = _get_vision_feature_select_strategy(
-self.pooler_config.pooling_type
+self.pooler_config.seq_pooling_type
)
pooled_output = self.vision_model(