[Kernel] Add KernelConfig flag to enable/disable FlashInfer autotune (#34006)

Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
Mohammad Miadh Angkad
2026-02-07 21:24:44 +08:00
committed by GitHub
parent edb359cce4
commit dd6a6e1190
5 changed files with 104 additions and 1 deletion

View File

@@ -30,6 +30,7 @@ from .cache import CacheConfig
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
from .device import DeviceConfig
from .ec_transfer import ECTransferConfig
from .kernel import KernelConfig
from .kv_events import KVEventsConfig
from .kv_transfer import KVTransferConfig
from .load import LoadConfig
@@ -129,6 +130,9 @@ OPTIMIZATION_LEVEL_00 = {
"cudagraph_mode": CUDAGraphMode.NONE,
"use_inductor_graph_partition": False,
},
"kernel_config": {
"enable_flashinfer_autotune": False,
},
}
OPTIMIZATION_LEVEL_01 = {
"compilation_config": {
@@ -145,6 +149,9 @@ OPTIMIZATION_LEVEL_01 = {
"cudagraph_mode": CUDAGraphMode.PIECEWISE,
"use_inductor_graph_partition": False,
},
"kernel_config": {
"enable_flashinfer_autotune": True,
},
}
OPTIMIZATION_LEVEL_02 = {
"compilation_config": {
@@ -161,6 +168,9 @@ OPTIMIZATION_LEVEL_02 = {
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
"use_inductor_graph_partition": False,
},
"kernel_config": {
"enable_flashinfer_autotune": True,
},
}
OPTIMIZATION_LEVEL_03 = {
"compilation_config": {
@@ -177,6 +187,9 @@ OPTIMIZATION_LEVEL_03 = {
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
"use_inductor_graph_partition": False,
},
"kernel_config": {
"enable_flashinfer_autotune": True,
},
}
OPTIMIZATION_LEVEL_TO_CONFIG = {
@@ -211,6 +224,8 @@ class VllmConfig:
"""Load configuration."""
attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
"""Attention configuration."""
kernel_config: KernelConfig = Field(default_factory=KernelConfig)
"""Kernel configuration."""
lora_config: LoRAConfig | None = None
"""LoRA configuration."""
speculative_config: SpeculativeConfig | None = None
@@ -756,6 +771,11 @@ class VllmConfig:
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
self._apply_optimization_level_defaults(default_config)
if self.kernel_config.enable_flashinfer_autotune is None:
raise ValueError(
"KernelConfig.enable_flashinfer_autotune must be set after applying "
"optimization level defaults."
)
if (
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()