[Kernel] Add KernelConfig flag to enable/disable FlashInfer autotune (#34006)
Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
edb359cce4
commit
dd6a6e1190
@@ -11,6 +11,7 @@ from vllm.config.compilation import (
|
|||||||
)
|
)
|
||||||
from vllm.config.device import DeviceConfig
|
from vllm.config.device import DeviceConfig
|
||||||
from vllm.config.ec_transfer import ECTransferConfig
|
from vllm.config.ec_transfer import ECTransferConfig
|
||||||
|
from vllm.config.kernel import KernelConfig
|
||||||
from vllm.config.kv_events import KVEventsConfig
|
from vllm.config.kv_events import KVEventsConfig
|
||||||
from vllm.config.kv_transfer import KVTransferConfig
|
from vllm.config.kv_transfer import KVTransferConfig
|
||||||
from vllm.config.load import LoadConfig
|
from vllm.config.load import LoadConfig
|
||||||
@@ -65,6 +66,8 @@ __all__ = [
|
|||||||
"DeviceConfig",
|
"DeviceConfig",
|
||||||
# From vllm.config.ec_transfer
|
# From vllm.config.ec_transfer
|
||||||
"ECTransferConfig",
|
"ECTransferConfig",
|
||||||
|
# From vllm.config.kernel
|
||||||
|
"KernelConfig",
|
||||||
# From vllm.config.kv_events
|
# From vllm.config.kv_events
|
||||||
"KVEventsConfig",
|
"KVEventsConfig",
|
||||||
# From vllm.config.kv_transfer
|
# From vllm.config.kv_transfer
|
||||||
|
|||||||
44
vllm/config/kernel.py
Normal file
44
vllm/config/kernel.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Callable
from typing import Any

from pydantic import Field, field_validator

from vllm.config.utils import config
from vllm.utils.hashing import safe_hash


@config
class KernelConfig:
    """Configuration for kernel selection and warmup behavior."""

    # FIX: the original annotated this field as plain `bool` while defaulting
    # to `None`; `None` is a legitimate "not yet set" state (filled in later
    # from the optimization-level defaults), so the honest type is
    # `bool | None`. The wrap validator below exists precisely to let the
    # `None` sentinel through.
    enable_flashinfer_autotune: bool | None = Field(default=None)
    """If True, run FlashInfer autotuning during kernel warmup. `None` means
    the value has not been set yet and is resolved from the optimization
    level defaults."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # No factors to consider: this config only toggles warmup-time
        # autotuning and does not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str

    @field_validator("enable_flashinfer_autotune", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialization is delayed."""
        if value is None:
            return value
        return handler(value)
|
||||||
@@ -30,6 +30,7 @@ from .cache import CacheConfig
|
|||||||
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
|
from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||||
from .device import DeviceConfig
|
from .device import DeviceConfig
|
||||||
from .ec_transfer import ECTransferConfig
|
from .ec_transfer import ECTransferConfig
|
||||||
|
from .kernel import KernelConfig
|
||||||
from .kv_events import KVEventsConfig
|
from .kv_events import KVEventsConfig
|
||||||
from .kv_transfer import KVTransferConfig
|
from .kv_transfer import KVTransferConfig
|
||||||
from .load import LoadConfig
|
from .load import LoadConfig
|
||||||
@@ -129,6 +130,9 @@ OPTIMIZATION_LEVEL_00 = {
|
|||||||
"cudagraph_mode": CUDAGraphMode.NONE,
|
"cudagraph_mode": CUDAGraphMode.NONE,
|
||||||
"use_inductor_graph_partition": False,
|
"use_inductor_graph_partition": False,
|
||||||
},
|
},
|
||||||
|
"kernel_config": {
|
||||||
|
"enable_flashinfer_autotune": False,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
OPTIMIZATION_LEVEL_01 = {
|
OPTIMIZATION_LEVEL_01 = {
|
||||||
"compilation_config": {
|
"compilation_config": {
|
||||||
@@ -145,6 +149,9 @@ OPTIMIZATION_LEVEL_01 = {
|
|||||||
"cudagraph_mode": CUDAGraphMode.PIECEWISE,
|
"cudagraph_mode": CUDAGraphMode.PIECEWISE,
|
||||||
"use_inductor_graph_partition": False,
|
"use_inductor_graph_partition": False,
|
||||||
},
|
},
|
||||||
|
"kernel_config": {
|
||||||
|
"enable_flashinfer_autotune": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
OPTIMIZATION_LEVEL_02 = {
|
OPTIMIZATION_LEVEL_02 = {
|
||||||
"compilation_config": {
|
"compilation_config": {
|
||||||
@@ -161,6 +168,9 @@ OPTIMIZATION_LEVEL_02 = {
|
|||||||
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
|
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
|
||||||
"use_inductor_graph_partition": False,
|
"use_inductor_graph_partition": False,
|
||||||
},
|
},
|
||||||
|
"kernel_config": {
|
||||||
|
"enable_flashinfer_autotune": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
OPTIMIZATION_LEVEL_03 = {
|
OPTIMIZATION_LEVEL_03 = {
|
||||||
"compilation_config": {
|
"compilation_config": {
|
||||||
@@ -177,6 +187,9 @@ OPTIMIZATION_LEVEL_03 = {
|
|||||||
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
|
"cudagraph_mode": CUDAGraphMode.FULL_AND_PIECEWISE,
|
||||||
"use_inductor_graph_partition": False,
|
"use_inductor_graph_partition": False,
|
||||||
},
|
},
|
||||||
|
"kernel_config": {
|
||||||
|
"enable_flashinfer_autotune": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
OPTIMIZATION_LEVEL_TO_CONFIG = {
|
OPTIMIZATION_LEVEL_TO_CONFIG = {
|
||||||
@@ -211,6 +224,8 @@ class VllmConfig:
|
|||||||
"""Load configuration."""
|
"""Load configuration."""
|
||||||
attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
|
attention_config: AttentionConfig = Field(default_factory=AttentionConfig)
|
||||||
"""Attention configuration."""
|
"""Attention configuration."""
|
||||||
|
kernel_config: KernelConfig = Field(default_factory=KernelConfig)
|
||||||
|
"""Kernel configuration."""
|
||||||
lora_config: LoRAConfig | None = None
|
lora_config: LoRAConfig | None = None
|
||||||
"""LoRA configuration."""
|
"""LoRA configuration."""
|
||||||
speculative_config: SpeculativeConfig | None = None
|
speculative_config: SpeculativeConfig | None = None
|
||||||
@@ -756,6 +771,11 @@ class VllmConfig:
|
|||||||
|
|
||||||
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
||||||
self._apply_optimization_level_defaults(default_config)
|
self._apply_optimization_level_defaults(default_config)
|
||||||
|
if self.kernel_config.enable_flashinfer_autotune is None:
|
||||||
|
raise ValueError(
|
||||||
|
"KernelConfig.enable_flashinfer_autotune must be set after applying "
|
||||||
|
"optimization level defaults."
|
||||||
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
|
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ECTransferConfig,
|
ECTransferConfig,
|
||||||
EPLBConfig,
|
EPLBConfig,
|
||||||
|
KernelConfig,
|
||||||
KVEventsConfig,
|
KVEventsConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
LoadConfig,
|
LoadConfig,
|
||||||
@@ -536,6 +537,10 @@ class EngineArgs:
|
|||||||
pooler_config: PoolerConfig | None = ModelConfig.pooler_config
|
pooler_config: PoolerConfig | None = ModelConfig.pooler_config
|
||||||
compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
|
compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
|
||||||
attention_config: AttentionConfig = get_field(VllmConfig, "attention_config")
|
attention_config: AttentionConfig = get_field(VllmConfig, "attention_config")
|
||||||
|
kernel_config: KernelConfig = get_field(VllmConfig, "kernel_config")
|
||||||
|
enable_flashinfer_autotune: bool = get_field(
|
||||||
|
KernelConfig, "enable_flashinfer_autotune"
|
||||||
|
)
|
||||||
worker_cls: str = ParallelConfig.worker_cls
|
worker_cls: str = ParallelConfig.worker_cls
|
||||||
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
worker_extension_cls: str = ParallelConfig.worker_extension_cls
|
||||||
|
|
||||||
@@ -595,6 +600,8 @@ class EngineArgs:
|
|||||||
self.compilation_config = CompilationConfig(**self.compilation_config)
|
self.compilation_config = CompilationConfig(**self.compilation_config)
|
||||||
if isinstance(self.attention_config, dict):
|
if isinstance(self.attention_config, dict):
|
||||||
self.attention_config = AttentionConfig(**self.attention_config)
|
self.attention_config = AttentionConfig(**self.attention_config)
|
||||||
|
if isinstance(self.kernel_config, dict):
|
||||||
|
self.kernel_config = KernelConfig(**self.kernel_config)
|
||||||
if isinstance(self.eplb_config, dict):
|
if isinstance(self.eplb_config, dict):
|
||||||
self.eplb_config = EPLBConfig(**self.eplb_config)
|
self.eplb_config = EPLBConfig(**self.eplb_config)
|
||||||
if isinstance(self.weight_transfer_config, dict):
|
if isinstance(self.weight_transfer_config, dict):
|
||||||
@@ -1163,6 +1170,17 @@ class EngineArgs:
|
|||||||
**compilation_kwargs["max_cudagraph_capture_size"],
|
**compilation_kwargs["max_cudagraph_capture_size"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Kernel arguments
|
||||||
|
kernel_kwargs = get_kwargs(KernelConfig)
|
||||||
|
kernel_group = parser.add_argument_group(
|
||||||
|
title="KernelConfig",
|
||||||
|
description=KernelConfig.__doc__,
|
||||||
|
)
|
||||||
|
kernel_group.add_argument(
|
||||||
|
"--enable-flashinfer-autotune",
|
||||||
|
**kernel_kwargs["enable_flashinfer_autotune"],
|
||||||
|
)
|
||||||
|
|
||||||
# vLLM arguments
|
# vLLM arguments
|
||||||
vllm_kwargs = get_kwargs(VllmConfig)
|
vllm_kwargs = get_kwargs(VllmConfig)
|
||||||
vllm_group = parser.add_argument_group(
|
vllm_group = parser.add_argument_group(
|
||||||
@@ -1189,6 +1207,7 @@ class EngineArgs:
|
|||||||
vllm_group.add_argument(
|
vllm_group.add_argument(
|
||||||
"--attention-config", "-ac", **vllm_kwargs["attention_config"]
|
"--attention-config", "-ac", **vllm_kwargs["attention_config"]
|
||||||
)
|
)
|
||||||
|
vllm_group.add_argument("--kernel-config", **vllm_kwargs["kernel_config"])
|
||||||
vllm_group.add_argument(
|
vllm_group.add_argument(
|
||||||
"--additional-config", **vllm_kwargs["additional_config"]
|
"--additional-config", **vllm_kwargs["additional_config"]
|
||||||
)
|
)
|
||||||
@@ -1717,6 +1736,17 @@ class EngineArgs:
|
|||||||
else:
|
else:
|
||||||
attention_config.backend = self.attention_backend
|
attention_config.backend = self.attention_backend
|
||||||
|
|
||||||
|
# Kernel config overrides
|
||||||
|
kernel_config = copy.deepcopy(self.kernel_config)
|
||||||
|
if self.enable_flashinfer_autotune is not None:
|
||||||
|
if kernel_config.enable_flashinfer_autotune is not None:
|
||||||
|
raise ValueError(
|
||||||
|
"enable_flashinfer_autotune and "
|
||||||
|
"kernel_config.enable_flashinfer_autotune "
|
||||||
|
"are mutually exclusive"
|
||||||
|
)
|
||||||
|
kernel_config.enable_flashinfer_autotune = self.enable_flashinfer_autotune
|
||||||
|
|
||||||
load_config = self.create_load_config()
|
load_config = self.create_load_config()
|
||||||
|
|
||||||
# Pass reasoning_parser into StructuredOutputsConfig
|
# Pass reasoning_parser into StructuredOutputsConfig
|
||||||
@@ -1767,6 +1797,7 @@ class EngineArgs:
|
|||||||
device_config=device_config,
|
device_config=device_config,
|
||||||
load_config=load_config,
|
load_config=load_config,
|
||||||
attention_config=attention_config,
|
attention_config=attention_config,
|
||||||
|
kernel_config=kernel_config,
|
||||||
lora_config=lora_config,
|
lora_config=lora_config,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
structured_outputs_config=self.structured_outputs_config,
|
structured_outputs_config=self.structured_outputs_config,
|
||||||
|
|||||||
@@ -36,8 +36,13 @@ def kernel_warmup(worker: "Worker"):
|
|||||||
max_tokens = worker.scheduler_config.max_num_batched_tokens
|
max_tokens = worker.scheduler_config.max_num_batched_tokens
|
||||||
deep_gemm_warmup(model, max_tokens)
|
deep_gemm_warmup(model, max_tokens)
|
||||||
|
|
||||||
|
enable_flashinfer_autotune = (
|
||||||
|
worker.vllm_config.kernel_config.enable_flashinfer_autotune
|
||||||
|
)
|
||||||
# FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
|
# FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
|
||||||
if has_flashinfer() and current_platform.has_device_capability(90):
|
if enable_flashinfer_autotune is False:
|
||||||
|
logger.info("Skipping FlashInfer autotune because it is disabled.")
|
||||||
|
elif has_flashinfer() and current_platform.has_device_capability(90):
|
||||||
flashinfer_autotune(worker.model_runner)
|
flashinfer_autotune(worker.model_runner)
|
||||||
|
|
||||||
# FlashInfer attention warmup
|
# FlashInfer attention warmup
|
||||||
|
|||||||
Reference in New Issue
Block a user