[V1] Remove V0 code paths for Hybrid models (#25400)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
Thomas Parnell
2025-09-23 17:26:13 +02:00
committed by GitHub
parent 2c58742dff
commit a903669e10
31 changed files with 352 additions and 2296 deletions

View File

@@ -8,7 +8,6 @@ import torch
import torch.nn as nn
from transformers import Lfm2Config
from vllm import envs
from vllm.attention import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, ModelConfig, VllmConfig
@@ -297,7 +296,6 @@ class Lfm2ShortConvDecoderLayer(nn.Module):
self.conv(
hidden_states,
output,
conv_metadata=None,
)
hidden_states, residual = self.ffn_norm(output, residual)
hidden_states = self.feed_forward(hidden_states)
@@ -459,13 +457,11 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
def get_mamba_state_shape_from_config(
cls,
vllm_config: "VllmConfig",
use_v1: bool = True,
) -> tuple[tuple[int, int]]:
""" Calculate shapes for LFM2's convolutional cache.
Args:
vllm_config: vLLM config
use_v1: Get shapes for V1 (or V0)
Returns:
Tuple containing:
@@ -478,7 +474,6 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
tp_world_size=parallel_config.tensor_parallel_size,
intermediate_size=hf_config.conv_dim,
conv_kernel=hf_config.conv_L_cache,
use_v1=use_v1,
)
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
@@ -489,8 +484,6 @@ class Lfm2ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
scheduler_config = vllm_config.scheduler_config
assert (not cache_config.enable_prefix_caching
), "Lfm2 currently does not support prefix caching"
assert envs.VLLM_USE_V1, (
"Lfm2ForCausalLM doesn't support vLLM v0. Please enable v1")
super().__init__()
self.config = config