[Kernel] Add KernelConfig flag to enable/disable FlashInfer autotune (#34006)

Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-07 21:24:44 +08:00
parent edb359cce4
commit dd6a6e1190
5 changed files with 104 additions and 1 deletion
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -30,6 +30,7 @@ from .cache import CacheConfig
 from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
 from .device import DeviceConfig
 from .ec_transfer import ECTransferConfig
+from .kernel import KernelConfig
 from .kv_events import KVEventsConfig
 from .kv_transfer import KVTransferConfig
 from .load import LoadConfig
@@ -129,6 +130,9 @@ OPTIMIZATION_LEVEL_00 = {
        "cudagraph_mode": CUDAGraphMode.NONE,
        "use_inductor_graph_partition": False,
    },
+    "kernel_config": {
+        "enable_flashinfer_autotune": False,
+    },
 }
 OPTIMIZATION_LEVEL_01 = {
    "compilation_config": {
@@ -145,6 +149,9 @@ OPTIMIZATION_LEVEL_01 = {
        "cudagraph_mode": CUDAGraphMode.PIECEWISE,
        "use_inductor_graph_partition": False,
    },
+    "kernel_config": {
+        "enable_flashinfer_autotune": True,
+    },
 }
 OPTIMIZATION_LEVEL_02 = {
    "compilation_config": {
@@ -161,6 +168,9 @@ OPTIMIZATION_LEVEL_02 = {
        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
        "use_inductor_graph_partition": False,
    },
+    "kernel_config": {
+        "enable_flashinfer_autotune": True,
+    },
 }
 OPTIMIZATION_LEVEL_03 = {
    "compilation_config": {
@@ -177,6 +187,9 @@ OPTIMIZATION_LEVEL_03 = {
        "cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
        "use_inductor_graph_partition": False,
    },
+    "kernel_config": {
+        "enable_flashinfer_autotune": True,
+    },
 }

 OPTIMIZATION_LEVEL_TO_CONFIG = {
@@ -211,6 +224,8 @@ class VllmConfig:
    """Load configuration."""
    attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
    """Attention configuration."""
+    kernel_config: KernelConfig = Field(default_factory=KernelConfig)
+    """Kernel configuration."""
    lora_config: LoRAConfig | None = None
    """LoRA configuration."""
    speculative_config: SpeculativeConfig | None = None
@@ -756,6 +771,11 @@ class VllmConfig:

        default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
        self._apply_optimization_level_defaults(default_config)
+        if self.kernel_config.enable_flashinfer_autotune is None:
+            raise ValueError(
+                "KernelConfig.enable_flashinfer_autotune must be set after applying "
+                "optimization level defaults."
+            )

        if (
            self.compilation_config.cudagraph_mode.requires_piecewise_compilation()