Default model load/config/tokenizer to mistral format if relevant files exist (#28659)

Signed-off-by: Julien Denize <julien.denize@mistral.ai> Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: mgoin <mgoin64@gmail.com>
2025-11-21 22:58:59 +01:00
parent c68c7b403d
commit 57430fc95c
15 changed files with 230 additions and 34 deletions
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -208,7 +208,7 @@ def test_mistral_format(
    with vllm_runner(
        model,
        dtype=dtype,
-        tokenizer_mode="auto",
+        tokenizer_mode="hf",
        load_format="safetensors",
        config_format="hf",
    ) as hf_format_model:
--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -50,12 +50,24 @@ def test_hf_model_weights_mapper(model_arch: str):
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

+    is_mistral_model = model_arch in [
+        "Mistral3ForConditionalGeneration",
+        "PixtralForConditionalGeneration",
+        "VoxtralForConditionalGeneration",
+    ]
+
+    if not is_mistral_model or model_info.tokenizer_mode == "mistral":
+        tokenizer_mode = model_info.tokenizer_mode
+    else:
+        tokenizer_mode = "hf"
+
    model_id = model_info.default

    model_config = ModelConfig(
        model_id,
        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
+        tokenizer_mode=tokenizer_mode,
+        config_format="hf",
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@@ -259,6 +259,9 @@ def validate_generated_texts(
        tensor_parallel_size=vllm_tp_size,
        enforce_eager=False,
        default_torch_num_threads=1,
+        tokenizer_mode="hf",
+        load_format="hf",
+        config_format="hf",
    ) as llm:
        vllm_outputs = llm.generate_greedy(prompts, max_tokens)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")