[Multimodal] Simplify MM input definitions (#33331)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-29 21:32:04 +08:00
parent 17b17c0684
commit c6e7404cc5
17 changed files with 142 additions and 164 deletions
--- a/tests/v1/streaming_input/test_gpu_model_runner_streaming.py
+++ b/tests/v1/streaming_input/test_gpu_model_runner_streaming.py
@@ -131,7 +131,7 @@ def test_e2e_streaming_with_multimodal_features(mock_model_runner_with_input_bat

    # Step 1: Create initial request state with one multimodal feature
    mm_feature_1 = MultiModalFeatureSpec(
-        data=MultiModalKwargsItem.dummy("audio"),
+        data=MultiModalKwargsItem.dummy(),
        modality="audio",
        identifier="audio_1",
        mm_position=PlaceholderRange(offset=2, length=10),
@@ -158,7 +158,7 @@ def test_e2e_streaming_with_multimodal_features(mock_model_runner_with_input_bat
    # The scheduler has already set prompt_token_ids to the full sequence
    # (original prompt + intermediate outputs + new prompt with new multimodal feature)
    mm_feature_2 = MultiModalFeatureSpec(
-        data=MultiModalKwargsItem.dummy("audio"),
+        data=MultiModalKwargsItem.dummy(),
        modality="audio",
        identifier="audio_2",
        mm_position=PlaceholderRange(offset=15, length=5),