[V0 Deprecation] Remove pooling model support in V0 (#23434)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Max de Bayser <mbayser@br.ibm.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
committed by
GitHub
parent
934bebf192
commit
2554b27baa
@@ -1566,8 +1566,7 @@ class EngineArgs:
|
||||
use_spec_decode = self.speculative_config is not None
|
||||
|
||||
if (is_gpu and not use_sliding_window and not use_spec_decode
|
||||
and not self.enable_lora
|
||||
and model_config.runner_type != "pooling"):
|
||||
and not self.enable_lora):
|
||||
self.enable_chunked_prefill = True
|
||||
logger.warning(
|
||||
"Chunked prefill is enabled by default for models "
|
||||
@@ -1585,10 +1584,6 @@ class EngineArgs:
|
||||
"OOM during the initial memory profiling phase, or result "
|
||||
"in low performance due to small KV cache size. Consider "
|
||||
"setting --max-model-len to a smaller value.", max_model_len)
|
||||
elif (self.enable_chunked_prefill
|
||||
and model_config.runner_type == "pooling"):
|
||||
msg = "Chunked prefill is not supported for pooling models"
|
||||
raise ValueError(msg)
|
||||
|
||||
# if using prefix caching, we must set a hash algo
|
||||
if self.enable_prefix_caching:
|
||||
|
||||
Reference in New Issue
Block a user