[VLM] Separate text-only and vision variants of the same model architecture (#13157)

2025-02-13 22:19:15 +08:00
parent 02ed8a1fbe
commit 1bc3b5e71b
15 changed files with 1728 additions and 1642 deletions
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
              max_num_seqs=2,
              trust_remote_code=True,
              enforce_eager=True,
+              hf_overrides={"architectures": ["GLM4VForCausalLM"]},
              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+
    prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
        {question}<|assistant|>"

@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
+        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )