feat(models): implement Google Gemma 4 architecture support (MoE, Multimodal, Reasoning, Tool-Use) (#38826)

Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com> Signed-off-by: Luciano Martins <lucianomartins@google.com> Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com>
2026-04-02 15:13:28 -03:00
parent ecd5443dbc
commit 08ed2b9688
20 changed files with 5051 additions and 1 deletions
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -9,6 +9,7 @@ from vllm.utils.math_utils import round_up
 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig

+
 logger = init_logger(__name__)


@@ -52,6 +53,58 @@ class Gemma3TextModelConfig(VerifyAndUpdateConfig):
        hf_config.is_causal = not hf_config.use_bidirectional_attention


+class Gemma4Config(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        """Force unified attention backend for models with heterogeneous
+        head dimensions.
+
+        Some Gemma4 variants use different head dimensions for
+        sliding window (head_dim) vs full attention (global_head_dim) layers.
+        When global_head_dim > 256, FlashAttention rejects those layers
+        (head_size <= 256 kernel limit), causing vLLM to select a different
+        backend for each layer type. This mixed-backend execution produces
+        numerical divergence and output corruption.
+
+        The fix detects heterogeneous head dimensions from the model config
+        and forces TRITON_ATTN (which has no head_size ceiling) for all
+        layers when the user hasn't explicitly chosen a backend.
+
+        TODO: Heterogeneous head_sizes (head_dim != global_head_dim)
+        require NixlConnector changes to support per-layer KV transfer
+        with different head dimensions for prefill-decode disaggregation.
+        """
+        hf_text_config = vllm_config.model_config.hf_text_config
+        head_dim = getattr(hf_text_config, "head_dim", None)
+        global_head_dim = getattr(hf_text_config, "global_head_dim", None)
+
+        # Only force Triton when head dimensions actually differ AND the
+        # larger one exceeds FlashAttention's kernel limit (head_size <= 256).
+        # This avoids unnecessary backend forcing on smaller models where
+        # the config carries global_head_dim but all layers can still use
+        # the same FA backend.
+        max_head_dim = max(head_dim or 0, global_head_dim or 0)
+        if (
+            head_dim is not None
+            and global_head_dim is not None
+            and head_dim != global_head_dim
+            and max_head_dim > 256
+            and vllm_config.attention_config.backend is None
+        ):
+            from vllm.v1.attention.backends.registry import (
+                AttentionBackendEnum,
+            )
+
+            vllm_config.attention_config.backend = AttentionBackendEnum.TRITON_ATTN
+            logger.info(
+                "Gemma4 model has heterogeneous head dimensions "
+                "(head_dim=%d, global_head_dim=%d). Forcing TRITON_ATTN "
+                "backend to prevent mixed-backend numerical divergence.",
+                head_dim,
+                global_head_dim,
+            )
+
+
 class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
@@ -533,6 +586,8 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
    "Ernie4_5_VLMoeForConditionalGeneration": Ernie4_5_VLMoeForConditionalGenerationConfig,  # noqa: E501
    "FalconMambaForCausalLM": MambaModelConfig,
    "Gemma3TextModel": Gemma3TextModelConfig,
+    "Gemma4ForCausalLM": Gemma4Config,
+    "Gemma4ForConditionalGeneration": Gemma4Config,
    "GptOssForCausalLM": GptOssForCausalLMConfig,
    "GteModel": SnowflakeGteNewModelConfig,
    "GteNewForSequenceClassification": GteNewModelConfig,