feature/isaac 0.2 (#31550)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
Akshat Shrivastava
2026-01-09 19:18:05 -08:00
committed by GitHub
parent ea6d067a2a
commit e45946bd91
4 changed files with 43 additions and 3 deletions

View File

@@ -526,7 +526,10 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
),
"isaac": VLMTestInfo(
models=["PerceptronAI/Isaac-0.1"],
models=[
"PerceptronAI/Isaac-0.1",
"PerceptronAI/Isaac-0.2-2B-Preview",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: (
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"

View File

@@ -708,6 +708,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"IsaacForConditionalGeneration": _HfExamplesInfo(
"PerceptronAI/Isaac-0.1",
trust_remote_code=True,
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
),
"InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True

View File

@@ -65,6 +65,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.hf import get_cached_tokenizer
from vllm.transformers_utils.config import patch_rope_parameters
from vllm.transformers_utils.configs import (
IsaacConfig,
PixelShuffleSiglip2VisionConfig,
@@ -1284,11 +1285,14 @@ class IsaacForConditionalGeneration(
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
"model.text_model.lm_head.": "language_model.lm_head.",
"model.text_model.": "language_model.model.",
"model.vision_embedding.0": "vision_embedding.transformer",
"model.vision_embedding.1": "vision_embedding.linear_fc1",
"model.vision_embedding.2": "vision_embedding.act",
"model.vision_embedding.3": "vision_embedding.linear_fc2",
"model.vision_embedding.": "vision_embedding.",
"model.lm_head.": "language_model.lm_head.",
"model.": "language_model.model.",
}
)
@@ -1319,7 +1323,25 @@ class IsaacForConditionalGeneration(
)
config.image_token_id = self.vision_token_id
config.rope_scaling["mrope_section"] = calculated_mrope_section
text_cfg = getattr(config, "text_config", None)
target_cfg = (
text_cfg
if text_cfg is not None and not isinstance(text_cfg, dict)
else config
)
rope_scaling = getattr(target_cfg, "rope_scaling", None)
if rope_scaling is None and target_cfg is config:
rope_scaling = getattr(config, "_rope_scaling", None)
patch_rope_parameters(target_cfg)
rope_parameters = target_cfg.rope_parameters
rope_parameters["mrope_section"] = calculated_mrope_section
if rope_scaling is not None and "mrope_interleaved" in rope_scaling:
rope_parameters.setdefault(
"mrope_interleaved", rope_scaling["mrope_interleaved"]
)
target_cfg.rope_parameters = rope_parameters
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
architectures=["Qwen3ForCausalLM"],

View File

@@ -32,10 +32,14 @@ class IsaacConfig(Qwen3Config):
"""Configuration class for Isaac multimodal model."""
model_type = "isaac"
sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}
sub_configs = {
"vision_config": PixelShuffleSiglip2VisionConfig,
"text_config": Qwen3Config,
}
def __init__(
self,
text_config=None,
vision_config=None,
vision_patch_size: int = 16,
vision_max_num_patches: int = 256,
@@ -48,6 +52,16 @@ class IsaacConfig(Qwen3Config):
):
super().__init__(**kwargs)
if isinstance(text_config, dict):
# from HF config
self.text_config = self.sub_configs["text_config"](**text_config)
elif text_config is None:
# For BC use all kwargs to init text config.
self.text_config = self.sub_configs["text_config"](**kwargs)
else:
# from Qwen3Config
self.text_config = text_config
# EventStreamProcessor parameters (for backward compatibility)
self.video_patch_size = vision_patch_size
self.vision_max_num_patches = vision_max_num_patches