[Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi
2025-08-12 00:41:37 +08:00
committed by GitHub
parent 16fb668b61
commit 84cf78acee
31 changed files with 452 additions and 261 deletions

View File

@@ -227,6 +227,20 @@ def test_get_pooling_config_from_args():
assert asdict(pooling_config) == asdict(override_pooler_config)
@pytest.mark.parametrize(
("model_id", "default_pooling_type", "pooling_type"),
[
("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "LAST", "LAST"), # LLM
("intfloat/e5-small", "CLS", "MEAN"), # BertModel
("Qwen/Qwen2.5-Math-RM-72B", "ALL", "ALL"), # reward
("Qwen/Qwen2.5-Math-PRM-7B", "STEP", "STEP") # step reward
])
def test_default_pooling_type(model_id, default_pooling_type, pooling_type):
model_config = ModelConfig(model_id)
assert model_config._model_info.default_pooling_type == default_pooling_type
assert model_config.pooler_config.pooling_type == pooling_type
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_get_bert_tokenization_sentence_transformer_config():