# Patch summary (text before the first "diff --git" header is skipped by patch tools).
#
# Purpose: extend the Isaac multimodal model integration so that the
# "PerceptronAI/Isaac-0.2-2B-Preview" checkpoint is covered next to
# "PerceptronAI/Isaac-0.1".
#
# tests/models/multimodal/generation/test_common.py
#   - The "isaac" VLMTestInfo entry now lists both checkpoints in `models`.
#
# tests/models/registry.py
#   - The "IsaacForConditionalGeneration" example entry gains an `extras`
#     mapping {"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"}.
#
# vllm/model_executor/models/isaac.py
#   - Imports `patch_rope_parameters` from vllm.transformers_utils.config.
#   - `hf_to_vllm_mapper` gains three prefix rules:
#       "model.text_model.lm_head." -> "language_model.lm_head."
#       "model.text_model."         -> "language_model.model."
#       "model.lm_head."            -> "language_model.lm_head."
#     Each more specific prefix is listed before the shorter prefix it
#     overlaps with ("model.text_model." / "model.").
#     NOTE(review): presumably the mapper applies the first matching prefix,
#     which is why the order matters - confirm against WeightsMapper.
#   - The direct write `config.rope_scaling["mrope_section"] = ...` is replaced:
#       1. `target_cfg` is `config.text_config` when that attribute exists and
#          is not a dict, otherwise `config` itself.
#       2. `rope_scaling` is read from `target_cfg`, falling back to
#          `config._rope_scaling` only when `target_cfg is config`.
#       3. `patch_rope_parameters(target_cfg)` is called, then
#          `target_cfg.rope_parameters["mrope_section"]` is set to
#          `calculated_mrope_section`.
#       4. When `rope_scaling` carries "mrope_interleaved", it is copied into
#          `rope_parameters` with `setdefault` (an existing value is kept).
#     NOTE(review): the code indexes `target_cfg.rope_parameters` right after
#     `patch_rope_parameters`, so it assumes that call leaves a dict there -
#     confirm for configs that define neither rope_theta nor rope_scaling.
#
# vllm/transformers_utils/configs/isaac.py
#   - `IsaacConfig.sub_configs` gains "text_config": Qwen3Config.
#   - `IsaacConfig.__init__` accepts `text_config=None` and stores
#     `self.text_config` as: a Qwen3Config built from the dict when a dict is
#     given; a Qwen3Config built from `**kwargs` when None (the in-code comment
#     marks this as the backward-compatibility path); or the passed object
#     unchanged otherwise.
#
# NOTE(review): line breaks, indentation and blank context lines below were
# laid out to agree with each hunk's "@@ -a,b +c,d @@" line counts; verify the
# context against the target tree before applying.
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index b1be3a376..f4cb701c0 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -526,7 +526,10 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
     ),
     "isaac": VLMTestInfo(
-        models=["PerceptronAI/Isaac-0.1"],
+        models=[
+            "PerceptronAI/Isaac-0.1",
+            "PerceptronAI/Isaac-0.2-2B-Preview",
+        ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
         prompt_formatter=lambda img_prompt: (
             f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
diff --git a/tests/models/registry.py b/tests/models/registry.py
index ebff16cec..fc57e8799 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -708,6 +708,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "IsaacForConditionalGeneration": _HfExamplesInfo(
         "PerceptronAI/Isaac-0.1",
         trust_remote_code=True,
+        extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
     ),
     "InternS1ForConditionalGeneration": _HfExamplesInfo(
         "internlm/Intern-S1", trust_remote_code=True
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index ffcc24446..6d331b95f 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -65,6 +65,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import get_tokenizer
 from vllm.tokenizers.hf import get_cached_tokenizer
+from vllm.transformers_utils.config import patch_rope_parameters
 from vllm.transformers_utils.configs import (
     IsaacConfig,
     PixelShuffleSiglip2VisionConfig,
@@ -1284,11 +1285,14 @@ class IsaacForConditionalGeneration(
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             "lm_head.": "language_model.lm_head.",
+            "model.text_model.lm_head.": "language_model.lm_head.",
+            "model.text_model.": "language_model.model.",
             "model.vision_embedding.0": "vision_embedding.transformer",
             "model.vision_embedding.1": "vision_embedding.linear_fc1",
             "model.vision_embedding.2": "vision_embedding.act",
             "model.vision_embedding.3": "vision_embedding.linear_fc2",
             "model.vision_embedding.": "vision_embedding.",
+            "model.lm_head.": "language_model.lm_head.",
             "model.": "language_model.model.",
         }
     )
@@ -1319,7 +1323,25 @@ class IsaacForConditionalGeneration(
         )
         config.image_token_id = self.vision_token_id
 
-        config.rope_scaling["mrope_section"] = calculated_mrope_section
+        text_cfg = getattr(config, "text_config", None)
+        target_cfg = (
+            text_cfg
+            if text_cfg is not None and not isinstance(text_cfg, dict)
+            else config
+        )
+
+        rope_scaling = getattr(target_cfg, "rope_scaling", None)
+        if rope_scaling is None and target_cfg is config:
+            rope_scaling = getattr(config, "_rope_scaling", None)
+
+        patch_rope_parameters(target_cfg)
+        rope_parameters = target_cfg.rope_parameters
+        rope_parameters["mrope_section"] = calculated_mrope_section
+        if rope_scaling is not None and "mrope_interleaved" in rope_scaling:
+            rope_parameters.setdefault(
+                "mrope_interleaved", rope_scaling["mrope_interleaved"]
+            )
+        target_cfg.rope_parameters = rope_parameters
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
             architectures=["Qwen3ForCausalLM"],
diff --git a/vllm/transformers_utils/configs/isaac.py b/vllm/transformers_utils/configs/isaac.py
index fc15011b5..ed36d19eb 100644
--- a/vllm/transformers_utils/configs/isaac.py
+++ b/vllm/transformers_utils/configs/isaac.py
@@ -32,10 +32,14 @@ class IsaacConfig(Qwen3Config):
     """Configuration class for Isaac multimodal model."""
 
     model_type = "isaac"
-    sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}
+    sub_configs = {
+        "vision_config": PixelShuffleSiglip2VisionConfig,
+        "text_config": Qwen3Config,
+    }
 
     def __init__(
         self,
+        text_config=None,
         vision_config=None,
         vision_patch_size: int = 16,
         vision_max_num_patches: int = 256,
@@ -48,6 +52,16 @@ class IsaacConfig(Qwen3Config):
     ):
         super().__init__(**kwargs)
 
+        if isinstance(text_config, dict):
+            # from HF config
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init text config.
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        else:
+            # from Qwen3Config
+            self.text_config = text_config
+
         # EventStreamProcessor parameters (for backward compatibility)
         self.video_patch_size = vision_patch_size
         self.vision_max_num_patches = vision_max_num_patches