Online video support for VLMs (#10020)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
litianjian
2024-11-08 04:25:59 +08:00
committed by GitHub
parent 97b8475beb
commit 28b2877d30
12 changed files with 598 additions and 31 deletions

View File

@@ -341,7 +341,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
get_llava_onevision_video_tokens(ctx, num_frames))
tokenizer = cached_get_tokenizer(model_config.tokenizer)
- new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens(
+ new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
tokenizer,
inputs.get("prompt"),
inputs["prompt_token_ids"],
@@ -350,7 +350,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext,
)
return token_inputs(prompt_token_ids=new_token_ids,
prompt=new_prompt,
- multi_modal_data=multi_modal_data)
+ multi_modal_data=multi_modal_data,
+ multi_modal_placeholders={"video": ranges})
else:
raise TypeError(f"Invalid video type: {type(video_data)}")