From 9572f74f1509e138988bf8288291f1f217f101ae Mon Sep 17 00:00:00 2001 From: ShaanveerS <146979751+ShaanveerS@users.noreply.github.com> Date: Thu, 8 Jan 2026 07:50:16 +0100 Subject: [PATCH] [Model] Enable LoRA support for tower and connector in DotsOCR (#31825) Signed-off-by: ShaanveerS --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/dots_ocr.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index e1287bdb4..07b1ced5c 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -371,7 +371,7 @@ th { | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | -| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | +| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | ✅︎ | ✅︎ | | `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 6d8dbec92..990e5e4c5 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -765,6 +765,14 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA def get_language_model(self) -> torch.nn.Module: return self.language_model + def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int: + merge_size = self.vision_tower.spatial_merge_size + return num_image_tokens * (merge_size**2) + + def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int: + merge_size = self.vision_tower.spatial_merge_size + return num_vision_tokens // (merge_size**2) + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: