[FlashInfer] Upgrade to 0.2.0 (#11194)

Signed-off-by: Bowen Wang <abmfy@icloud.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
Bowen Wang
2025-01-28 02:19:24 +08:00
committed by GitHub
parent 3f1fc7425a
commit 2bc3fbba0c
10 changed files with 256 additions and 74 deletions

View File

@@ -310,14 +310,15 @@ class ModelConfig:
(self.hf_text_config.model_type in ["gemma2", "cohere2"]))
if (not self.disable_sliding_window and has_interleaved_attention):
if envs.VLLM_ATTENTION_BACKEND == "XFORMERS":
if (backend :=
envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"):
sliding_window_len_min = get_min_sliding_window(
self.hf_text_config.sliding_window)
logger.warning_once(
f"{self.hf_text_config.model_type} has interleaved "
"attention, which is currently not supported by the "
"XFORMERS backend. Disabling sliding window and capping "
f"{backend} backend. Disabling sliding window and capping "
"the max length to the sliding window size "
f"({sliding_window_len_min}).")
self.disable_sliding_window = True
@@ -3310,7 +3311,7 @@ _current_vllm_config: Optional[VllmConfig] = None
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig):
def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
"""
Temporarily set the current VLLM config.
Used during model initialization.
@@ -3330,7 +3331,8 @@ def set_current_vllm_config(vllm_config: VllmConfig):
vllm_config.compilation_config.enabled_custom_ops)
logger.debug("disabled custom ops: %s",
vllm_config.compilation_config.disabled_custom_ops)
if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
if check_compile and \
vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
and compilation_counter.num_models_seen == num_models_seen:
# If the model supports compilation,
# compilation_counter.num_models_seen should be increased