[VLM] Separate text-only and vision variants of the same model architecture (#13157)

This commit is contained in:
Cyrus Leung
2025-02-13 22:19:15 +08:00
committed by GitHub
parent 02ed8a1fbe
commit 1bc3b5e71b
15 changed files with 1728 additions and 1642 deletions

View File

@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>"
@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)