[Performance] V1 Classify Models E2E Performance Optimization (#23541)

Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi
2025-08-29 18:12:32 +08:00
committed by GitHub
parent ad39106b16
commit d9e00dbd1f
5 changed files with 81 additions and 38 deletions

View File

@@ -1248,10 +1248,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
and "encode" in supported_tasks):
supported_tasks.remove("encode")
logger.info_once("Chunked prefill is not supported with "
"encode task which using ALL pooling. "
"Please turn off chunked prefill by "
"`--no-enable-chunked-prefill` before using it.")
logger.debug_once("Chunked prefill is not supported with "
"encode task which using ALL pooling. "
"Please turn off chunked prefill by "
"`--no-enable-chunked-prefill` before using it.")
if "score" in supported_tasks:
num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
if num_labels != 1:
supported_tasks.remove("score")
logger.debug_once(
"Score API is only enabled for num_labels == 1.")
return supported_tasks