[CI] Heavy refactoring of Voxtral multimodal audio model tests (#34294)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-02-13 22:04:29 -06:00
committed by GitHub
parent 60ca7981bc
commit de42abb366
11 changed files with 350 additions and 70 deletions

View File

@@ -184,22 +184,42 @@ def get_text_token_prompts(
text_prompt: str | None
token_prompt: list[int]
if isinstance(tokenizer, MistralTokenizer):
images = parsed_data.get("image", [])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
# ChatCompletionRequest only supports ImageChunk natively;
# for other modalities (e.g. audio), fall back to the model's
# own dummy inputs builder which knows the right placeholders.
has_non_image = any(
k != "image" and count > 0 for k, count in mm_counts.items()
)
res = tokenizer.mistral.encode_chat_completion(request)
# Mistral does not support decode_tokens with skip_special_tokens=False
text_prompt = None
token_prompt = res.tokens
if has_non_image:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
)
text_prompt = None
token_prompt = (
inputs.prompt
if isinstance(inputs.prompt, list)
else tokenizer.encode(inputs.prompt, add_special_tokens=False)
)
else:
images = parsed_data.get("image", [])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
# Mistral does not support decode_tokens with
# skip_special_tokens=False
text_prompt = None
token_prompt = res.tokens
else:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,