Enable conversion of multimodal models to pooling tasks (#24451)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
This commit is contained in:
committed by
GitHub
parent
6a50eaa0d3
commit
e090b7b45b
114
tests/models/language/pooling/test_mm_classifier_conversion.py
Normal file
114
tests/models/language/pooling/test_mm_classifier_conversion.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def test_idefics_multimodal(
|
||||
vllm_runner,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
if current_platform.is_rocm():
|
||||
# ROCm Triton FA does not currently support sliding window attention
|
||||
# switch to use ROCm CK FA backend
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
with vllm_runner(model_name="HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="dummy",
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
llm = vllm_model.get_llm()
|
||||
outputs = llm.classify(prompts)
|
||||
for output in outputs:
|
||||
assert len(output.outputs.probs) == 2
|
||||
|
||||
|
||||
def update_config(config):
|
||||
config.text_config.update({
|
||||
"architectures": ["Gemma3ForSequenceClassification"],
|
||||
"classifier_from_token": ["A", "B", "C", "D", "E"],
|
||||
"method":
|
||||
"no_post_processing",
|
||||
"id2label": {
|
||||
"A": "Chair",
|
||||
"B": "Couch",
|
||||
"C": "Table",
|
||||
"D": "Bed",
|
||||
"E": "Cupboard"
|
||||
},
|
||||
})
|
||||
return config
|
||||
|
||||
|
||||
def test_gemma_multimodal(
|
||||
vllm_runner,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
if current_platform.is_rocm():
|
||||
# ROCm Triton FA does not currently support sliding window attention
|
||||
# switch to use ROCm CK FA backend
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"system",
|
||||
"content":
|
||||
"""
|
||||
You are a helpful assistant. You will be given a product description
|
||||
which may also include an image. Classify the following product into
|
||||
one of the categories:
|
||||
|
||||
A = chair
|
||||
B = couch
|
||||
C = table
|
||||
D = bed
|
||||
E = cupboard
|
||||
|
||||
You'll answer with exactly one letter (A, B, C, D, or E)."""
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url":
|
||||
"https://upload.wikimedia.org/wikipedia/commons/c/c6/Set_of_fourteen_side_chairs_MET_DP110780.jpg"
|
||||
}
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "A fine 19th century piece of furniture."
|
||||
}]
|
||||
}]
|
||||
|
||||
with vllm_runner(model_name="google/gemma-3-4b-it",
|
||||
runner="pooling",
|
||||
task="classify",
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
override_pooler_config={"pooling_type": "LAST"},
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
disable_log_stats=True,
|
||||
dtype="bfloat16") as vllm_model:
|
||||
|
||||
llm = vllm_model.get_llm()
|
||||
prompts = llm.preprocess_chat(messages)
|
||||
|
||||
result = llm.classify(prompts)
|
||||
assert result[0].outputs.probs[0] > 0.95
|
||||
assert all(c < 0.05 for c in result[0].outputs.probs[1:])
|
||||
Reference in New Issue
Block a user