[Multimodal] Simplify MM input definitions (#33331)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-29 21:32:04 +08:00
committed by GitHub
parent 17b17c0684
commit c6e7404cc5
17 changed files with 142 additions and 164 deletions

View File

@@ -131,7 +131,7 @@ def test_e2e_streaming_with_multimodal_features(mock_model_runner_with_input_bat
# Step 1: Create initial request state with one multimodal feature
mm_feature_1 = MultiModalFeatureSpec(
-data=MultiModalKwargsItem.dummy("audio"),
+data=MultiModalKwargsItem.dummy(),
modality="audio",
identifier="audio_1",
mm_position=PlaceholderRange(offset=2, length=10),
@@ -158,7 +158,7 @@ def test_e2e_streaming_with_multimodal_features(mock_model_runner_with_input_bat
# The scheduler has already set prompt_token_ids to the full sequence
# (original prompt + intermediate outputs + new prompt with new multimodal feature)
mm_feature_2 = MultiModalFeatureSpec(
-data=MultiModalKwargsItem.dummy("audio"),
+data=MultiModalKwargsItem.dummy(),
modality="audio",
identifier="audio_2",
mm_position=PlaceholderRange(offset=15, length=5),