[Model] Add support for Cheers multimodal model (#38788)

Signed-off-by: bsliu <1187291748@qq.com>
Signed-off-by: 吴炳贤 <wubingxian24@mails.ucas.ac.cn>
Authored by bsliu on 2026-04-02 21:01:40 +08:00, committed by GitHub
commit c0817e4d39 (parent dfe5e31689)
11 changed files with 996 additions and 0 deletions


@@ -541,6 +541,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
| `CheersForConditionalGeneration` | Cheers | T + I | `ai9stars/Cheers` | | ✅︎ |
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
| `DeepseekVLV2ForCausalLM` | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ |
| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I<sup>+</sup> | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ |
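
For reference, a minimal offline-inference sketch for the new Cheers entry, assuming the chat-template prompt used by the `run_cheers` example added in this PR; the image path and sampling settings are placeholders, not values from the model card.

```python
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="ai9stars/Cheers",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|image_pad|>What is shown in this image?<|im_end|>\n"
    "<|im_start|>assistant\n"
)
image = Image.open("example.jpg")  # placeholder image path

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```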


@@ -179,6 +179,33 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
# Cheers
def run_cheers(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "ai9stars/Cheers"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [
(
f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|image_pad|>{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -2140,6 +2167,7 @@ model_example_map = {
"aria": run_aria,
"aya_vision": run_aya_vision,
"bagel": run_bagel,
"cheers": run_cheers,
"bee": run_bee,
"blip-2": run_blip2,
"chameleon": run_chameleon,


@@ -766,6 +766,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"6b": "Salesforce/blip2-opt-6.7b"},
),
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),
"Cheers": _HfExamplesInfo(
"ai9stars/Cheers",
trust_remote_code=True,
),
"CheersForConditionalGeneration": _HfExamplesInfo(
"ai9stars/Cheers",
trust_remote_code=True,
),
"Cohere2VisionForConditionalGeneration": _HfExamplesInfo(
"CohereLabs/command-a-vision-07-2025"
),


@@ -1203,6 +1203,7 @@ class ModelConfig:
"gemma3",
"molmo2",
"paligemma",
"umm",
)
if not hasattr(self.hf_config, "model_type"):
return False


@@ -0,0 +1,753 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Inference-only Cheers (UMM) model compatible with HuggingFace weights.
Cheers is a unified multimodal model for image understanding and generation.
For vLLM, we focus on the image understanding (vision-to-text) capabilities.
The image generation part (gen_projector, hi_gate, etc.) is not supported,
but the VAE encoder + decoder projector are required for image understanding.
"""
import math
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Literal, TypeAlias
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import BatchFeature
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
PromptReplacement,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.cheers import CheersProcessor
from vllm.utils.tensor_schema import TensorSchema
from .interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
SupportsMultiModal,
SupportsPP,
)
from .siglip import SiglipVisionModel
from .utils import (
AutoWeightsLoader,
WeightsMapper,
init_vllm_registered_model,
maybe_prefix,
)
logger = init_logger(__name__)
# ── VAE components (needed for image understanding pipeline) ────────
def _swish(x: torch.Tensor) -> torch.Tensor:
return x * torch.sigmoid(x)
class _AttnBlock(nn.Module):
def __init__(self, in_channels: int):
super().__init__()
self.norm = nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
self.q = nn.Conv2d(in_channels, in_channels, 1)
self.k = nn.Conv2d(in_channels, in_channels, 1)
self.v = nn.Conv2d(in_channels, in_channels, 1)
self.proj_out = nn.Conv2d(in_channels, in_channels, 1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
h_ = self.norm(x)
q = self.q(h_)
k = self.k(h_)
v = self.v(h_)
b, c, h, w = q.shape
q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
h_ = F.scaled_dot_product_attention(q, k, v)
h_ = rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
return x + self.proj_out(h_)
class _ResnetBlock(nn.Module):
def __init__(self, in_channels: int, out_channels: int):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.norm1 = nn.GroupNorm(32, in_channels, eps=1e-6, affine=True)
self.conv1 = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
self.norm2 = nn.GroupNorm(32, out_channels, eps=1e-6, affine=True)
self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1)
if in_channels != out_channels:
self.nin_shortcut = nn.Conv2d(in_channels, out_channels, 1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
h = _swish(self.norm1(x))
h = self.conv1(h)
h = _swish(self.norm2(h))
h = self.conv2(h)
if self.in_channels != self.out_channels:
x = self.nin_shortcut(x)
return x + h
class _Downsample(nn.Module):
def __init__(self, in_channels: int):
super().__init__()
self.conv = nn.Conv2d(in_channels, in_channels, 3, stride=2, padding=0)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
return self.conv(x)
class _Upsample(nn.Module):
def __init__(self, in_channels: int):
super().__init__()
self.conv = nn.Conv2d(in_channels, in_channels, 3, 1, 1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = F.interpolate(x, scale_factor=2.0, mode="nearest")
return self.conv(x)
_VAE_ENCODER_DEFAULTS = {
"in_channels": 3,
"ch": 128,
"ch_mult": [1, 2, 4, 4],
"num_res_blocks": 2,
"z_channels": 32,
}
_VAE_DECODER_DEFAULTS = {
"in_channels": 3,
"out_ch": 3,
"ch": 128,
"ch_mult": [1, 2, 4, 4],
"num_res_blocks": 2,
"z_channels": 32,
}
def _cfg(config, key, defaults=None):
"""Access config attribute whether it's a dict or namespace object."""
if isinstance(config, dict):
if key in config:
return config[key]
if defaults and key in defaults:
return defaults[key]
raise KeyError(f"Key '{key}' not found in config dict: {list(config.keys())}")
return getattr(config, key)
class CheersVAEEncoder(nn.Module):
"""VAE encoder from the Cheers/UMM model."""
def __init__(self, config):
super().__init__()
d = _VAE_ENCODER_DEFAULTS
ch = _cfg(config, "ch", d)
ch_mult = _cfg(config, "ch_mult", d)
num_res_blocks = _cfg(config, "num_res_blocks", d)
z_channels = _cfg(config, "z_channels", d)
in_channels = _cfg(config, "in_channels", d)
num_resolutions = len(ch_mult)
self.quant_conv = nn.Conv2d(2 * z_channels, 2 * z_channels, 1)
self.conv_in = nn.Conv2d(in_channels, ch, 3, 1, 1)
in_ch_mult = (1,) + tuple(ch_mult)
self.down = nn.ModuleList()
block_in = ch
for i_level in range(num_resolutions):
block = nn.ModuleList()
attn = nn.ModuleList()
block_in = ch * in_ch_mult[i_level]
block_out = ch * ch_mult[i_level]
for _ in range(num_res_blocks):
block.append(_ResnetBlock(block_in, block_out))
block_in = block_out
down = nn.Module()
down.block = block
down.attn = attn
if i_level != num_resolutions - 1:
down.downsample = _Downsample(block_in)
self.down.append(down)
self.mid = nn.Module()
self.mid.block_1 = _ResnetBlock(block_in, block_in)
self.mid.attn_1 = _AttnBlock(block_in)
self.mid.block_2 = _ResnetBlock(block_in, block_in)
self.norm_out = nn.GroupNorm(32, block_in, eps=1e-6, affine=True)
self.conv_out = nn.Conv2d(block_in, 2 * z_channels, 3, 1, 1)
self._num_resolutions = num_resolutions
self._num_res_blocks = num_res_blocks
def forward(self, x: torch.Tensor) -> torch.Tensor:
hs = [self.conv_in(x)]
for i_level in range(self._num_resolutions):
for i_block in range(self._num_res_blocks):
h = self.down[i_level].block[i_block](hs[-1])
if len(self.down[i_level].attn) > 0:
h = self.down[i_level].attn[i_block](h)
hs.append(h)
if hasattr(self.down[i_level], "downsample"):
hs.append(self.down[i_level].downsample(hs[-1]))
h = hs[-1]
h = self.mid.block_1(h)
h = self.mid.attn_1(h)
h = self.mid.block_2(h)
h = _swish(self.norm_out(h))
h = self.conv_out(h)
h = self.quant_conv(h)
return h
class CheersVAEDecoder(nn.Module):
"""VAE decoder (used inside VAEDecoderProjector)."""
def __init__(self, config):
super().__init__()
d = _VAE_DECODER_DEFAULTS
ch = _cfg(config, "ch", d)
ch_mult = _cfg(config, "ch_mult", d)
num_res_blocks = _cfg(config, "num_res_blocks", d)
z_channels = _cfg(config, "z_channels", d)
out_ch = _cfg(config, "out_ch", d)
num_resolutions = len(ch_mult)
self.post_quant_conv = nn.Conv2d(z_channels, z_channels, 1)
block_in = ch * ch_mult[num_resolutions - 1]
self.conv_in = nn.Conv2d(z_channels, block_in, 3, 1, 1)
self.mid = nn.Module()
self.mid.block_1 = _ResnetBlock(block_in, block_in)
self.mid.attn_1 = _AttnBlock(block_in)
self.mid.block_2 = _ResnetBlock(block_in, block_in)
self.up = nn.ModuleList()
for i_level in reversed(range(num_resolutions)):
block = nn.ModuleList()
attn = nn.ModuleList()
block_out = ch * ch_mult[i_level]
for _ in range(num_res_blocks + 1):
block.append(_ResnetBlock(block_in, block_out))
block_in = block_out
up = nn.Module()
up.block = block
up.attn = attn
if i_level != 0:
up.upsample = _Upsample(block_in)
self.up.insert(0, up)
self.norm_out = nn.GroupNorm(32, block_in, eps=1e-6, affine=True)
self.conv_out = nn.Conv2d(block_in, out_ch, 3, 1, 1)
self._num_resolutions = num_resolutions
self._num_res_blocks = num_res_blocks
def forward(self, z: torch.Tensor) -> torch.Tensor:
z = self.post_quant_conv(z)
upscale_dtype = next(self.up.parameters()).dtype
h = self.conv_in(z)
h = self.mid.block_1(h)
h = self.mid.attn_1(h)
h = self.mid.block_2(h)
h = h.to(upscale_dtype)
for i_level in reversed(range(self._num_resolutions)):
for i_block in range(self._num_res_blocks + 1):
h = self.up[i_level].block[i_block](h)
if len(self.up[i_level].attn) > 0:
h = self.up[i_level].attn[i_block](h)
if i_level != 0:
h = self.up[i_level].upsample(h)
h = _swish(self.norm_out(h))
return self.conv_out(h)
class CheersVAEModel(nn.Module):
"""VAE model with encoder only (for image understanding)."""
def __init__(self, config):
super().__init__()
enc_cfg = _cfg(config, "vae_encoder_config")
self.encoder = CheersVAEEncoder(enc_cfg)
self.ps = [2, 2]
z_ch = _cfg(enc_cfg, "z_channels", _VAE_ENCODER_DEFAULTS)
self.bn = nn.BatchNorm2d(
math.prod(self.ps) * z_ch,
eps=1e-4,
momentum=0.1,
affine=False,
track_running_stats=True,
)
def encode(self, x: torch.Tensor) -> torch.Tensor:
self.bn.eval()
moments = self.encoder(x)
mean = torch.chunk(moments, 2, dim=1)[0]
z = rearrange(
mean,
"... c (i pi) (j pj) -> ... (c pi pj) i j",
pi=self.ps[0],
pj=self.ps[1],
)
return self.bn(z)
class CheersVAEDecoderProjector(nn.Module):
"""VAE decoder projector that converts latent back to pixel-like space."""
def __init__(self, config):
super().__init__()
dec_cfg = _cfg(config, "vae_decoder_config")
enc_cfg = _cfg(config, "vae_encoder_config")
self.decoder = CheersVAEDecoder(dec_cfg)
self.ps = [2, 2]
z_ch = _cfg(enc_cfg, "z_channels", _VAE_ENCODER_DEFAULTS)
self.bn = nn.BatchNorm2d(
math.prod(self.ps) * z_ch,
eps=1e-4,
momentum=0.1,
affine=False,
track_running_stats=True,
)
def forward(self, z: torch.Tensor) -> torch.Tensor:
self.bn.eval()
s = torch.sqrt(self.bn.running_var.view(1, -1, 1, 1) + 1e-4)
m = self.bn.running_mean.view(1, -1, 1, 1)
z = z * s + m
z = rearrange(
z,
"... (c pi pj) i j -> ... c (i pi) (j pj)",
pi=self.ps[0],
pj=self.ps[1],
)
return self.decoder(z)
class CheersImagePixelInputs(TensorSchema):
"""
Dimensions:
- bn: Batch size * number of images
- c: Number of channels (3)
- h: Height of each image
- w: Width of each image
"""
type: Literal["pixel_values"]
pixel_values: torch.Tensor # Shape: (bn, 3, h, w)
CheersImageInputs: TypeAlias = CheersImagePixelInputs
class CheersUndProjector(nn.Module):
"""Understanding projector that maps vision features to LLM dimension
with 2x2 spatial compression (4x token reduction)."""
def __init__(
self,
image_embed_dim: int,
text_embed_dim: int,
compression_factor: tuple[int, int] = (2, 2),
quant_config: QuantizationConfig | None = None,
prefix: str = "",
):
super().__init__()
self.image_embed_dim = image_embed_dim
self.text_embed_dim = text_embed_dim
self.compression_factor = compression_factor
self.layernorm = nn.LayerNorm(image_embed_dim)
hidden_size = image_embed_dim * (compression_factor[0] * compression_factor[1])
self.mlp = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.GELU(),
nn.Linear(hidden_size, text_embed_dim),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.layernorm(x)
height = width = int(x.size(1) ** 0.5)
x = x.permute(0, 2, 1).unflatten(-1, (height, width))
batch_size, dim, height, width = x.shape
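# Split the (height, width) feature grid into non-overlapping
# compression_factor windows and fold each window into the channel
# dimension, reducing the number of tokens by that factor.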
unfolded = x.unfold(
2, self.compression_factor[0], self.compression_factor[0]
).unfold(3, self.compression_factor[1], self.compression_factor[1])
unfolded = unfolded.contiguous().view(
batch_size,
dim,
-1,
self.compression_factor[0] * self.compression_factor[1],
)
unfolded = (
unfolded.permute(0, 2, 3, 1)
.contiguous()
.view(
batch_size,
-1,
dim * self.compression_factor[0] * self.compression_factor[1],
)
)
return self.mlp(unfolded)
class CheersProcessingInfo(BaseProcessingInfo):
"""Processing information for Cheers model."""
def get_hf_processor(self, **kwargs: object) -> CheersProcessor:
from vllm.transformers_utils.processor import cached_get_image_processor
image_processor = cached_get_image_processor(
self.ctx.model_config.model,
revision=self.ctx.model_config.revision,
trust_remote_code=self.ctx.model_config.trust_remote_code,
)
tokenizer = self.get_tokenizer()
return CheersProcessor(
image_processor=image_processor,
tokenizer=tokenizer,
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
hf_config = self.get_hf_config()
vit_config = hf_config.vision_representation_config
patch_size = vit_config.patch_size
image_size = vit_config.image_size
num_patches = (image_size // patch_size) ** 2
# After 2x2 compression, tokens reduce by 4x
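# (illustrative numbers: image_size=448, patch_size=14 -> (448 // 14) ** 2 = 1024 patches -> 256 tokens)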
num_tokens = num_patches // 4
return {"image": num_tokens}
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
hf_config = self.get_hf_config()
vit_config = hf_config.vision_representation_config
patch_size = vit_config.patch_size
image_size = vit_config.image_size
num_patches = (image_size // patch_size) ** 2
return num_patches // 4
class CheersDummyInputsBuilder(BaseDummyInputsBuilder[CheersProcessingInfo]):
"""Build dummy inputs for Cheers model profiling."""
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
return "<|image_pad|>" * num_images
def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
hf_config = self.info.get_hf_config()
vit_config = hf_config.vision_representation_config
image_size = vit_config.image_size
image_overrides = mm_options.get("image") if mm_options else None
return {
"image": self._get_dummy_images(
width=image_size,
height=image_size,
num_images=num_images,
overrides=image_overrides,
),
}
class CheersMultiModalProcessor(BaseMultiModalProcessor[CheersProcessingInfo]):
"""Multimodal processor for Cheers model."""
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
return super()._call_hf_processor(prompt, mm_data, mm_kwargs, tok_kwargs)
def _hf_processor_applies_updates(
self,
prompt_text: str,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
) -> bool:
return False
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptReplacement]:
hf_config = self.info.get_hf_config()
vit_config = hf_config.vision_representation_config
patch_size = vit_config.patch_size
image_size = vit_config.image_size
tokenizer = self.info.get_tokenizer()
image_token_id = tokenizer.get_vocab().get("<|image_pad|>")
if image_token_id is None:
raise ValueError(
"Image token '<|image_pad|>' not found in tokenizer vocabulary"
)
def get_replacement_cheers(item_idx: int):
num_patches = (image_size // patch_size) ** 2
num_tokens = num_patches // 4
return [image_token_id] * num_tokens
return [
PromptReplacement(
modality="image",
target=[image_token_id],
replacement=get_replacement_cheers,
)
]
def _get_mm_fields_config(
self,
hf_inputs: Any,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return {
"pixel_values": MultiModalFieldConfig.batched("image"),
}
@MULTIMODAL_REGISTRY.register_processor(
CheersMultiModalProcessor,
info=CheersProcessingInfo,
dummy_inputs=CheersDummyInputsBuilder,
)
class CheersForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP
):
"""
Cheers: A unified multimodal model for image understanding and generation.
For vLLM, we focus on the image understanding (vision-to-text) capabilities.
The image generation part is not supported in vLLM.
"""
requires_raw_input_tokens = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"model.language_model.": "language_model.model.",
"model.vision_representation.": "vision_representation.vision_model.",
"model.und_projector.": "und_projector.",
"model.vae_model.": "vae_model.",
"model.vae_decoder_projector.": "vae_decoder_projector.",
"lm_head.": "language_model.lm_head.",
}
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
return "<|image_pad|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
multimodal_config = vllm_config.model_config.multimodal_config
if type(config).__name__ not in ("CheersConfig", "UMMConfig"):
raise ValueError(
f"Expected CheersConfig or UMMConfig, got {type(config).__name__}."
)
self.config = config
self.multimodal_config = multimodal_config
# The Cheers model's custom Qwen2Config defaults rope_theta to
# 1_000_000, but this isn't stored in the JSON. vLLM's standard
# Qwen2Config defaults to 10_000, causing a 100× mismatch.
# We must patch BOTH the attribute AND rope_parameters (which
# patch_rope_parameters may have already populated from the wrong
# default before __init__ runs).
_CHEERS_ROPE_THETA = 1_000_000.0
tc = config.text_config
old_theta = getattr(tc, "rope_theta", None)
if old_theta != _CHEERS_ROPE_THETA:
logger.info(
"Overriding text_config.rope_theta from %s to %s",
old_theta,
_CHEERS_ROPE_THETA,
)
tc.rope_theta = _CHEERS_ROPE_THETA
rp = getattr(tc, "rope_parameters", None)
if rp is not None and rp.get("rope_theta") != _CHEERS_ROPE_THETA:
logger.info(
"Overriding rope_parameters.rope_theta from %s to %s",
rp.get("rope_theta"),
_CHEERS_ROPE_THETA,
)
rp["rope_theta"] = _CHEERS_ROPE_THETA
with self._mark_language_model(vllm_config):
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=config.text_config,
prefix=maybe_prefix(prefix, "language_model"),
architectures=["Qwen2ForCausalLM"],
)
vit_config = config.vision_representation_config
with self._mark_tower_model(vllm_config, "image"):
self.vae_model = CheersVAEModel(config)
self.vae_decoder_projector = CheersVAEDecoderProjector(config)
self.vision_representation = SiglipVisionModel(
config=vit_config,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "vision_representation"),
)
vit_hidden_size = vit_config.hidden_size
llm_hidden_size = config.text_config.hidden_size
self.und_projector = CheersUndProjector(
image_embed_dim=vit_hidden_size,
text_embed_dim=llm_hidden_size,
compression_factor=(2, 2),
quant_config=quant_config,
prefix=maybe_prefix(prefix, "und_projector"),
)
self.make_empty_intermediate_tensors = (
self.language_model.make_empty_intermediate_tensors
)
def _parse_and_validate_image_input(
self, **kwargs: object
) -> CheersImageInputs | None:
pixel_values = kwargs.pop("pixel_values", None)
if pixel_values is None:
return None
return CheersImagePixelInputs(
type="pixel_values",
pixel_values=pixel_values,
)
def _process_image_input(
self, image_input: CheersImageInputs
) -> tuple[torch.Tensor, ...]:
"""Process image inputs through VAE → SigLIP → projector pipeline.
HF native path: pixel_values → VAE.encode(t=1.0) → vae_decoder_projector
→ SigLIP → und_projector → text-space embeddings
"""
pixel_values = image_input["pixel_values"]
if pixel_values.ndim == 5:
batch_size, num_images, channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(
batch_size * num_images, channels, height, width
)
with torch.no_grad():
vae_dtype = next(self.vae_model.parameters()).dtype
image_latent = self.vae_model.encode(pixel_values.to(dtype=vae_dtype))
image_pixel_hat = self.vae_decoder_projector(image_latent)
vision_features = self.vision_representation(image_pixel_hat)
vision_embeds = self.und_projector(vision_features)
return tuple(vision_embeds)
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return []
return self._process_image_input(image_input)
def forward(
self,
input_ids: torch.Tensor | None,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
**kwargs: object,
) -> torch.Tensor | IntermediateTensors:
if intermediate_tensors is not None:
inputs_embeds = None
hidden_states = self.language_model.model(
input_ids=input_ids,
positions=positions,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
) -> torch.Tensor | None:
return self.language_model.compute_logits(hidden_states)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
"""Load weights, keeping VAE encoder/decoder projector for understanding."""
skip_prefixes = [
"model.time_embed.",
"model.gen_projector.",
"model.hi_gate.",
"model.hi_projector.",
"model.vae_model.decoder.",
]
skip_keywords = [
"text_loss_fc",
]
filtered_weights = []
for name, tensor in weights:
if any(name.startswith(p) for p in skip_prefixes):
continue
if any(kw in name for kw in skip_keywords):
continue
filtered_weights.append((name, tensor))
loader = AutoWeightsLoader(self)
return loader.load_weights(filtered_weights, mapper=self.hf_to_vllm_mapper)


@@ -350,6 +350,8 @@ _MULTIMODAL_MODELS = {
"chameleon",
"ChameleonForConditionalGeneration",
),
"Cheers": ("cheers", "CheersForConditionalGeneration"),
"CheersForConditionalGeneration": ("cheers", "CheersForConditionalGeneration"),
"Cohere2VisionForConditionalGeneration": (
"cohere2_vision",
"Cohere2VisionForConditionalGeneration",


@@ -80,6 +80,7 @@ class LazyConfigDict(dict):
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig",
bagel="BagelConfig",
umm="CheersConfig",
chatglm="ChatGLMConfig",
colmodernvbert="ColModernVBertConfig",
colpali="ColPaliConfig",


@@ -18,6 +18,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"AXK1Config": "vllm.transformers_utils.configs.AXK1",
"BagelConfig": "vllm.transformers_utils.configs.bagel",
"CheersConfig": "vllm.transformers_utils.configs.cheers",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"ColModernVBertConfig": "vllm.transformers_utils.configs.colmodernvbert",
"ColPaliConfig": "vllm.transformers_utils.configs.colpali",
@@ -75,6 +76,7 @@ __all__ = [
"AfmoeConfig",
"AXK1Config",
"BagelConfig",
"CheersConfig",
"ChatGLMConfig",
"ColModernVBertConfig",
"ColPaliConfig",


@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.modeling_rope_utils import rope_config_validation
class CheersTextConfig(PretrainedConfig):
"""Qwen2-based text config with Cheers-specific defaults."""
model_type = "umm"
base_config_key = "text_config"
def __init__(
self,
vocab_size=152064,
hidden_size=3584,
intermediate_size=18944,
num_hidden_layers=28,
num_attention_heads=28,
num_key_value_heads=4,
hidden_act="silu",
max_position_embeddings=131072,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
tie_word_embeddings=False,
rope_theta=1000000.0,
rope_scaling=None,
use_sliding_window=False,
sliding_window=131072,
max_window_layers=28,
layer_types=None,
attention_dropout=0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.use_sliding_window = use_sliding_window
self.sliding_window = sliding_window if self.use_sliding_window else None
self.max_window_layers = max_window_layers
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_dropout = attention_dropout
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
self.layer_types = layer_types
if self.layer_types is None:
self.layer_types = [
"sliding_attention"
if self.sliding_window is not None and i >= self.max_window_layers
else "full_attention"
for i in range(self.num_hidden_layers)
]
super().__init__(
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
class CheersConfig(PretrainedConfig):
"""Configuration class for Cheers (UMM) model."""
model_type = "umm"
def __init__(
self,
text_config: dict | CheersTextConfig | None = None,
vision_representation_config: dict | SiglipVisionConfig | None = None,
vae_encoder_config: dict | None = None,
vae_decoder_config: dict | None = None,
**kwargs,
):
super().__init__(**kwargs)
if isinstance(text_config, dict):
self.text_config = CheersTextConfig(**text_config)
else:
self.text_config = text_config or CheersTextConfig()
if isinstance(vision_representation_config, dict):
self.vision_representation_config = SiglipVisionConfig(
**vision_representation_config
)
else:
self.vision_representation_config = (
vision_representation_config or SiglipVisionConfig()
)
self.vae_encoder_config = vae_encoder_config or {"resolution": 512}
self.vae_decoder_config = vae_decoder_config or {"resolution": 512}
@property
def hidden_size(self) -> int:
"""Return the hidden size of the language model."""
return self.text_config.hidden_size
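
A hypothetical usage sketch of how this config composes its sub-configs from plain dicts, mirroring what `__init__` does when deserializing a config; the field values below are illustrative, not taken from the released checkpoint.

```python
from vllm.transformers_utils.configs.cheers import CheersConfig

cfg = CheersConfig(
    text_config={"hidden_size": 3584, "num_hidden_layers": 28},
    vision_representation_config={"image_size": 448, "patch_size": 14},
    vae_encoder_config={"resolution": 512, "z_channels": 32},
    vae_decoder_config={"resolution": 512, "z_channels": 32},
)

# Sub-configs become typed objects; VAE configs stay as plain dicts.
assert cfg.hidden_size == cfg.text_config.hidden_size == 3584
assert cfg.vision_representation_config.patch_size == 14
assert cfg.vae_encoder_config["z_channels"] == 32
```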


@@ -12,6 +12,7 @@ import importlib
__all__ = [
"BagelProcessor",
"CheersProcessor",
"CohereASRProcessor",
"DeepseekVLV2Processor",
"FireRedASR2Processor",
@@ -39,6 +40,7 @@ __all__ = [
_CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel",
"CheersProcessor": "vllm.transformers_utils.processors.cheers",
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",


@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cheers (UMM) processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class CheersProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
_defaults = {
"images_kwargs": {
"return_tensors": "pt",
},
}
class CheersProcessor(ProcessorMixin):
"""
Constructs a Cheers processor which wraps a
SigLIP image processor and a Qwen2 tokenizer.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
def __call__(
self,
text: TextInput
| PreTokenizedInput
| list[TextInput]
| list[PreTokenizedInput]
| None = None,
images: ImageInput | None = None,
**kwargs: Unpack[CheersProcessorKwargs],
):
output_kwargs = self._merge_kwargs(
CheersProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if images is not None:
import torch
if isinstance(images, (list, tuple)):
all_pv = []
all_ghw = []
for img in images:
result = self.image_processor(img, **output_kwargs["images_kwargs"])
all_pv.append(result["pixel_values"])
if "grid_hws" in result:
all_ghw.append(result["grid_hws"])
pixel_values = {
"pixel_values": torch.cat(all_pv, dim=0),
}
if all_ghw:
pixel_values["grid_hws"] = torch.cat(all_ghw, dim=0)
else:
pixel_values = self.image_processor(
images, **output_kwargs["images_kwargs"]
)
else:
pixel_values = {}
text_inputs = (
self.tokenizer(text, **output_kwargs["text_kwargs"])
if text is not None
else {}
)
return BatchFeature(data={**pixel_values, **text_inputs})
def batch_decode(self, *args, **kwargs):
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
AutoProcessor.register("CheersProcessor", CheersProcessor)
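
A hypothetical end-to-end sketch of the processor: the repo id comes from the supported-models table above, while the image path, prompt text, and exact output keys are assumptions (the latter depend on the underlying image processor).

```python
from PIL import Image
from transformers import AutoImageProcessor, AutoTokenizer

from vllm.transformers_utils.processors.cheers import CheersProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "ai9stars/Cheers", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("ai9stars/Cheers", trust_remote_code=True)
processor = CheersProcessor(image_processor=image_processor, tokenizer=tokenizer)

inputs = processor(
    text="<|im_start|>user\n<|image_pad|>Describe the image.<|im_end|>\n",
    images=Image.open("example.jpg"),  # placeholder image path
)
# Expected keys (assumption): pixel_values plus input_ids / attention_mask.
print(sorted(inputs.keys()))
```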