[CI] Heavy refactoring of Voxtral multimodal audio model tests (#34294)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-02-13 22:04:29 -06:00
committed by GitHub
parent 60ca7981bc
commit de42abb366
11 changed files with 350 additions and 70 deletions

View File

@@ -10,6 +10,7 @@ from mistral_common.protocol.transcription.request import (
TranscriptionRequest,
)
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy
from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
@@ -26,7 +27,7 @@ ENGINE_CONFIG = dict(
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
-gpu_memory_utilization=0.4,
+gpu_memory_utilization=0.9,
)
@@ -148,6 +149,9 @@ async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine)
output_tokens_list.append(output_tokens)
-texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list]
+texts = [
+tokenizer.decode(output_tokens, special_token_policy=SpecialTokenPolicy.IGNORE)
+for output_tokens in output_tokens_list
+]
texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my")
assert texts == EXPECTED_TEXT