[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used (#30241)

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Author: Po-Han Huang (NVIDIA)
Date: 2025-12-11 03:18:51 +08:00
Committed by: GitHub
parent 9f042ba26b
commit eea41804a4
2 changed files with 6 additions and 1 deletion

@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
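The docstring addition above is the heart of the fix: the helper reads the process-wide vLLM config, which is only guaranteed to be set during engine initialization, so calling it later is what emits the "Current vLLM config is not set." warning. Below is a minimal, self-contained sketch of that pattern; all names are hypothetical stand-ins (this is not vLLM source), loosely mimicking the set_current_vllm_config()/get_current_vllm_config() pair.

import contextlib
import warnings

# Process-wide config, set only inside the init context (hypothetical stand-in).
_CURRENT_CONFIG: dict | None = None

@contextlib.contextmanager
def set_current_config(cfg: dict):
    global _CURRENT_CONFIG
    _CURRENT_CONFIG = cfg
    try:
        yield
    finally:
        _CURRENT_CONFIG = None

def force_flag_from_config() -> bool | None:
    # Outside the init context the config is gone, so we warn and fall back
    # to None (auto-detection) -- the behavior the commit title describes.
    if _CURRENT_CONFIG is None:
        warnings.warn("Current config is not set.")
        return None
    return _CURRENT_CONFIG.get("use_trtllm_attention")

with set_current_config({"use_trtllm_attention": True}):
    assert force_flag_from_config() is True  # fine: called during init

force_flag_from_config()  # warns: called after init, config no longer set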
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
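Taken together, the two hunks move the config read out of the hot path: use_trtllm_attention() no longer calls force_use_trtllm_attention() itself, and instead receives the already-resolved tri-state flag from its caller. A hedged sketch of the resulting caller-side pattern follows; the class and method names are hypothetical, not vLLM's.

class AttentionBackendSketch:
    """Hypothetical backend illustrating init-time resolution of the flag."""

    def __init__(self, force_use_trtllm: bool | None):
        # Resolved once during init (e.g. from force_use_trtllm_attention(),
        # while the vllm config is still set) and cached for later steps.
        self.force_use_trtllm = force_use_trtllm

    def should_use_trtllm(self, auto_detected: bool) -> bool:
        # None means auto-detection, True means force on, False means force
        # off -- the same convention as the parameter comment in the diff.
        if self.force_use_trtllm is not None:
            return self.force_use_trtllm
        return auto_detected

backend = AttentionBackendSketch(force_use_trtllm=None)
print(backend.should_use_trtllm(auto_detected=True))   # True: auto-detection
print(AttentionBackendSketch(False).should_use_trtllm(auto_detected=True))  # False: forced off

Because the per-step check now only reads a cached value, no config lookup happens outside initialization, which is what silences the warning.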