Re-submit: Fix: Proper RGBA -> RGB conversion for PIL images. (#18569)

Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-05-22 18:59:18 -07:00
parent 46791e1b4b
commit 04eb88dc80
15 changed files with 89 additions and 20 deletions
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer

 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs

@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,

    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

    inputs_vision_speech = [
        (