[Core] Whisper support torch.compile (#30385)
Signed-off-by: NickLucche <nlucches@redhat.com>
@@ -25,6 +25,7 @@ from vllm.config import (
     set_current_vllm_config,
 )
 from vllm.config.compilation import DynamicShapesType
+from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -388,6 +389,12 @@ def _support_torch_compile(
         if self.do_not_compile or torch.compiler.is_compiling():
             return self.forward(*args, **kwargs)
 
+        # If skip_compiled is set, bypass the compiled model call. This is
+        # used e.g. for enc-dec models where tensor shapes/types vary across
+        # invocations, preventing the capture of a single computational graph.
+        if is_forward_context_available() and get_forward_context().skip_compiled:
+            return self.forward(*args, **kwargs)
+
         # if aot_compiled_fn is set, call it with partition wrapper context.
         # The partition wrapper must be active at runtime for CUDA graph
         # capture to work correctly with inductor graph partitioning.
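For context, here is a minimal, self-contained sketch (not part of the commit) of the bypass pattern the second hunk adds: a module whose compiled entry point falls back to eager forward whenever the active forward context sets skip_compiled, as an encoder-decoder runner would for shape-unstable encoder calls. The ForwardContext dataclass and set_forward_context helper below are simplified stand-ins for vllm.forward_context, not its real API.

from contextlib import contextmanager
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class ForwardContext:
    # Simplified stand-in for vLLM's forward-context object.
    skip_compiled: bool = False


_forward_ctx: Optional[ForwardContext] = None


@contextmanager
def set_forward_context(ctx: ForwardContext):
    # Install `ctx` as the active forward context for the duration of a call.
    global _forward_ctx
    prev, _forward_ctx = _forward_ctx, ctx
    try:
        yield
    finally:
        _forward_ctx = prev


class TinyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        # Compile the eager forward once; the per-call dispatch below
        # decides whether to use it.
        self._compiled_forward = torch.compile(self.forward)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        # Mirrors the added guard: skip the compiled artifact when the
        # context says shapes/types may not match the captured graph.
        if _forward_ctx is not None and _forward_ctx.skip_compiled:
            return self.forward(x)
        return self._compiled_forward(x)


model = TinyModel()
x = torch.randn(2, 8)
with set_forward_context(ForwardContext(skip_compiled=True)):
    y_eager = model(x)  # bypasses torch.compile
y_compiled = model(x)   # uses the compiled path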