[Model] Add support for moonshotai/Kimi-Audio-7B-Instruct (#36127)

Signed-off-by: tunglinwood <tunglinwood@gmail.com> Signed-off-by: tunglinwood <tomwu.tunglin@gmail.com> Signed-off-by: tunglinwood <113751333+tunglinwood@users.noreply.github.com>
2026-03-11 12:24:48 +08:00
parent a197eda9c3
commit 42fadebecb
14 changed files with 1446 additions and 29 deletions
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -198,13 +198,17 @@ def get_text_token_prompts(
            mm_counts,
            mm_options={},
        )
-        assert isinstance(inputs.prompt, str)
-
-        text_prompt = inputs.prompt
-        token_prompt = tokenizer.encode(
-            text_prompt,
-            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
-        )
+        # Some models (e.g., Kimi-Audio) set inputs.prompt to token IDs, not a str
+        if isinstance(inputs.prompt, list):
+            text_prompt = None
+            token_prompt = inputs.prompt
+        else:
+            assert isinstance(inputs.prompt, str)
+            text_prompt = inputs.prompt
+            token_prompt = tokenizer.encode(
+                text_prompt,
+                add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
+            )

    return text_prompt, token_prompt

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -857,6 +857,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "Kwai-Keye/Keye-VL-1_5-8B",
        trust_remote_code=True,
    ),
+    "MoonshotKimiaForCausalLM": _HfExamplesInfo(
+        "moonshotai/Kimi-Audio-7B-Instruct",
+        tokenizer_mode="kimi_audio",
+        trust_remote_code=True,
+    ),
+    "KimiK25ForConditionalGeneration": _HfExamplesInfo(
+        "moonshotai/Kimi-K2.5",
+        trust_remote_code=True,
+    ),
    "KimiVLForConditionalGeneration": _HfExamplesInfo(
        "moonshotai/Kimi-VL-A3B-Instruct",
        extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
@@ -870,10 +879,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            )
        },
    ),
-    "KimiK25ForConditionalGeneration": _HfExamplesInfo(
-        "moonshotai/Kimi-K2.5",
-        trust_remote_code=True,
-    ),
    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
        "lightonai/LightOnOCR-1B-1025"
    ),
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -103,6 +103,12 @@ def can_initialize(
            "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`"
        )

+    if model_arch == "MoonshotKimiaForCausalLM":
+        pytest.skip(
+            "Kimi-Audio requires SpeechToTextConfig "
+            "which is not configured in test environment"
+        )
+
    if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]:
        from vllm.platforms import current_platform