diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 60e1c9b07..98a09f200 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -1889,6 +1889,32 @@ def run_step3(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# stepfun-ai/Step3-VL-10B
+def run_step_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "stepfun-ai/Step3-VL-10B"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        tensor_parallel_size=1,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+        reasoning_parser="deepseek_r1",
+    )
+
+    prompts = [
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # omni-research/Tarsier-7b
 def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -2006,6 +2032,7 @@ model_example_map = {
     "skywork_chat": run_skyworkr1v,
     "smolvlm": run_smolvlm,
     "step3": run_step3,
+    "stepvl": run_step_vl,
     "tarsier": run_tarsier,
     "tarsier2": run_tarsier2,
 }
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index db213d1ff..9973fe27a 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1182,6 +1182,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_step_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "stepfun-ai/Step3-VL-10B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_batched_tokens=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        hf_overrides={"vision_config": {"enable_patch": False}},
+        trust_remote_code=True,
+        reasoning_parser="deepseek_r1",
+    )
+
+    prompt = (
+        "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
+        f"{'<im_patch>' * len(image_urls)}{question}<|EOT|><|BOT|>"
+        "assistant\n<think>\n"
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "omni-research/Tarsier-7b"
 
@@ -1374,6 +1400,7 @@ model_example_map = {
     "rvl": load_r_vl,
     "smolvlm": load_smolvlm,
     "step3": load_step3,
+    "stepvl": load_step_vl,
     "tarsier": load_tarsier,
     "tarsier2": load_tarsier2,
     "glm4_5v": load_glm4_5v,
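
The new entry is normally driven through the example harness (assuming the script's usual `--model-type` flag, e.g. `python examples/offline_inference/vision_language.py --model-type stepvl`). For trying the prompt format outside the harness, here is a minimal standalone sketch using vLLM's offline `LLM` API. The special tokens (`<im_patch>`, `<think>`, the BOS marker) mirror the functions added above and should be checked against the model's official chat template; the image asset and sampling values are illustrative only, and `reasoning_parser` is omitted since it only affects output post-processing.

```python
# Minimal sketch (not part of the diff): exercises the same Step3-VL-10B
# prompt format as run_step_vl() via vLLM's offline LLM API.
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Engine arguments mirror run_step_vl() above, minus the reasoning parser.
llm = LLM(
    model="stepfun-ai/Step3-VL-10B",
    max_num_batched_tokens=4096,
    trust_remote_code=True,
    limit_mm_per_prompt={"image": 1},
)

question = "What is the content of this image?"
# Prompt template as reconstructed in the diff; treat the exact special
# tokens as assumptions pending the model's published template.
prompt = (
    "<｜begin▁of▁sentence｜> You are a helpful assistant.<|BOT|>user\n "
    f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
)

# vLLM accepts a dict pairing the prompt with its multi-modal inputs.
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
    },
    SamplingParams(temperature=0.0, max_tokens=256),
)
print(outputs[0].outputs[0].text)
```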