[Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)
Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
@@ -1600,11 +1600,10 @@ class EngineArgs:
 else:
 
     pooling_type = model_config.pooler_config.pooling_type
-
-    # TODO: when encoder models are supported we'll have to
-    # check for causal attention here.
-    incremental_prefill_supported = (pooling_type is not None and
-                                     pooling_type.lower() == "last")
+    is_causal = getattr(model_config.hf_config, "is_causal", True)
+    incremental_prefill_supported = (pooling_type is not None
+                                     and pooling_type.lower() == "last"
+                                     and is_causal)
 
     action = "Enabling" if \
         incremental_prefill_supported else "Disabling"
Reference in New Issue
Block a user