[Frontend][torch.compile] CompilationConfig Overhaul (#20283): rename compilation level to compilation mode and deprecate compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e66d787bce
commit
96b9aa5aa0
@@ -247,12 +247,12 @@ class CpuPlatform(Platform):
|
||||
parallel_config.enable_dbo = False
|
||||
|
||||
# Note: workaround for v1 gpu_model_runner
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.config import CompilationMode
|
||||
|
||||
vllm_config.compilation_config.cudagraph_capture_sizes = []
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE:
|
||||
if vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
|
||||
# Note: vLLM V1 is using VLLM_COMPILE mode compilation (formerly the PIECEWISE level), which will
|
||||
# take time to compile kernels just-in-time with the inductor
|
||||
# backend. For CPU CI tests, most of them are executed fast and
|
||||
@@ -265,7 +265,7 @@ class CpuPlatform(Platform):
|
||||
else:
|
||||
backend = "inductor"
|
||||
|
||||
compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
compilation_config.backend = backend
|
||||
compilation_config.inductor_compile_config.update(
|
||||
{
|
||||
@@ -277,7 +277,7 @@ class CpuPlatform(Platform):
|
||||
)
|
||||
|
||||
if vllm_config.lora_config is not None:
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
assert vllm_config.device_config.device_type == "cpu"
|
||||
|
||||
|
||||
@@ -114,7 +114,7 @@ class TpuPlatform(Platform):
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
from vllm.config import CompilationLevel, CUDAGraphMode
|
||||
from vllm.config import CompilationMode, CUDAGraphMode
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
# For v0, the default block size is 16.
|
||||
@@ -122,12 +122,13 @@ class TpuPlatform(Platform):
|
||||
cache_config.block_size = cast(BlockSize, 16)
|
||||
compilation_config = vllm_config.compilation_config
|
||||
|
||||
# TPU only supports DYNAMO_ONCE compilation level
|
||||
if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
|
||||
# TPU only supports DYNAMO_TRACE_ONCE compilation mode
|
||||
if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
|
||||
logger.info(
|
||||
"[TPU] Forcing DYNAMO_ONCE compilation level, and disabling cudagraph."
|
||||
"[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and\
|
||||
disabling cudagraph."
|
||||
)
|
||||
compilation_config.level = CompilationLevel.DYNAMO_ONCE
|
||||
compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
|
||||
|
||||
if (
|
||||
compilation_config.cudagraph_mode is None
|
||||
|
||||
@@ -144,7 +144,7 @@ class XPUPlatform(Platform):
|
||||
cache_config.block_size = 64
|
||||
|
||||
# lazy import to avoid circular import
|
||||
from vllm.config import CompilationLevel, CUDAGraphMode
|
||||
from vllm.config import CompilationMode, CUDAGraphMode
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if compilation_config.compile_sizes is None:
|
||||
@@ -155,7 +155,7 @@ class XPUPlatform(Platform):
|
||||
)
|
||||
|
||||
if vllm_config.lora_config is not None:
|
||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
# check and update parallel config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
|
||||
Reference in New Issue
Block a user