feature/isaac 0.2 (#31550)
Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
committed by
GitHub
parent
ea6d067a2a
commit
e45946bd91
@@ -526,7 +526,10 @@ VLM_TEST_SETTINGS = {
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
),
|
||||
"isaac": VLMTestInfo(
|
||||
models=["PerceptronAI/Isaac-0.1"],
|
||||
models=[
|
||||
"PerceptronAI/Isaac-0.1",
|
||||
"PerceptronAI/Isaac-0.2-2B-Preview",
|
||||
],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: (
|
||||
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
@@ -708,6 +708,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"IsaacForConditionalGeneration": _HfExamplesInfo(
|
||||
"PerceptronAI/Isaac-0.1",
|
||||
trust_remote_code=True,
|
||||
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
|
||||
),
|
||||
"InternS1ForConditionalGeneration": _HfExamplesInfo(
|
||||
"internlm/Intern-S1", trust_remote_code=True
|
||||
|
||||
@@ -65,6 +65,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.tokenizers.hf import get_cached_tokenizer
|
||||
from vllm.transformers_utils.config import patch_rope_parameters
|
||||
from vllm.transformers_utils.configs import (
|
||||
IsaacConfig,
|
||||
PixelShuffleSiglip2VisionConfig,
|
||||
@@ -1284,11 +1285,14 @@ class IsaacForConditionalGeneration(
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_prefix={
|
||||
"lm_head.": "language_model.lm_head.",
|
||||
"model.text_model.lm_head.": "language_model.lm_head.",
|
||||
"model.text_model.": "language_model.model.",
|
||||
"model.vision_embedding.0": "vision_embedding.transformer",
|
||||
"model.vision_embedding.1": "vision_embedding.linear_fc1",
|
||||
"model.vision_embedding.2": "vision_embedding.act",
|
||||
"model.vision_embedding.3": "vision_embedding.linear_fc2",
|
||||
"model.vision_embedding.": "vision_embedding.",
|
||||
"model.lm_head.": "language_model.lm_head.",
|
||||
"model.": "language_model.model.",
|
||||
}
|
||||
)
|
||||
@@ -1319,7 +1323,25 @@ class IsaacForConditionalGeneration(
|
||||
)
|
||||
config.image_token_id = self.vision_token_id
|
||||
|
||||
config.rope_scaling["mrope_section"] = calculated_mrope_section
|
||||
text_cfg = getattr(config, "text_config", None)
|
||||
target_cfg = (
|
||||
text_cfg
|
||||
if text_cfg is not None and not isinstance(text_cfg, dict)
|
||||
else config
|
||||
)
|
||||
|
||||
rope_scaling = getattr(target_cfg, "rope_scaling", None)
|
||||
if rope_scaling is None and target_cfg is config:
|
||||
rope_scaling = getattr(config, "_rope_scaling", None)
|
||||
|
||||
patch_rope_parameters(target_cfg)
|
||||
rope_parameters = target_cfg.rope_parameters
|
||||
rope_parameters["mrope_section"] = calculated_mrope_section
|
||||
if rope_scaling is not None and "mrope_interleaved" in rope_scaling:
|
||||
rope_parameters.setdefault(
|
||||
"mrope_interleaved", rope_scaling["mrope_interleaved"]
|
||||
)
|
||||
target_cfg.rope_parameters = rope_parameters
|
||||
self.language_model = init_vllm_registered_model(
|
||||
vllm_config=vllm_config,
|
||||
architectures=["Qwen3ForCausalLM"],
|
||||
|
||||
@@ -32,10 +32,14 @@ class IsaacConfig(Qwen3Config):
|
||||
"""Configuration class for Isaac multimodal model."""
|
||||
|
||||
model_type = "isaac"
|
||||
sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}
|
||||
sub_configs = {
|
||||
"vision_config": PixelShuffleSiglip2VisionConfig,
|
||||
"text_config": Qwen3Config,
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text_config=None,
|
||||
vision_config=None,
|
||||
vision_patch_size: int = 16,
|
||||
vision_max_num_patches: int = 256,
|
||||
@@ -48,6 +52,16 @@ class IsaacConfig(Qwen3Config):
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if isinstance(text_config, dict):
|
||||
# from HF config
|
||||
self.text_config = self.sub_configs["text_config"](**text_config)
|
||||
elif text_config is None:
|
||||
# For BC use all kwargs to init text config.
|
||||
self.text_config = self.sub_configs["text_config"](**kwargs)
|
||||
else:
|
||||
# from Qwen3Config
|
||||
self.text_config = text_config
|
||||
|
||||
# EventStreamProcessor parameters (for backward compatibility)
|
||||
self.video_patch_size = vision_patch_size
|
||||
self.vision_max_num_patches = vision_max_num_patches
|
||||
|
||||
Reference in New Issue
Block a user