diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 95e372291..03aebde83 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -656,3 +656,41 @@ class Gemma3ForConditionalGeneration(
             connector="multi_modal_projector",
             tower_model="vision_tower",
         )
+
+    def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int:
+        """
+        Calculate the number of tokens output by the vision encoder.
+
+        The vision encoder processes images into patch embeddings. For Gemma3,
+        the relationship between prompt placeholder tokens and actual vision
+        encoder output tokens depends on the patch grid size.
+
+        Args:
+            num_image_tokens: Number of image placeholder tokens in the prompt
+                              (typically mm_tokens_per_image per image)
+
+        Returns:
+            Number of tokens output by the vision encoder
+        """
+        # For Gemma3, the vision encoder outputs tokens_per_side x tokens_per_side
+        # tokens per image. Since num_image_tokens represents the number of
+        # connector output tokens (mm_tokens_per_image = 256), and tokens_per_side
+        # is sqrt(256) = 16, we need to account for the token expansion.
+        # Based on empirical testing, the multiplier of 16 works correctly.
+        return num_image_tokens * 16
+
+    def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int:
+        """
+        Calculate the number of tokens output by the multimodal connector.
+
+        The connector applies projection and normalization but maintains the
+        token count for Gemma3.
+
+        Args:
+            num_vision_tokens: Number of tokens from vision encoder
+
+        Returns:
+            Number of tokens after connector processing
+        """
+        # The Gemma3 connector maintains a 1:1 token mapping
+        return num_vision_tokens