nano_nemotron_vl: suppress readonly torch.from_numpy() warning in image and video resize paths (#37903)

Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
2026-03-25 01:25:56 +02:00
parent b73b5b0629
commit a0d487b2e1
1 changed file with 32 additions and 44 deletions
--- a/vllm/transformers_utils/processors/nano_nemotron_vl.py
+++ b/vllm/transformers_utils/processors/nano_nemotron_vl.py
@@ -8,6 +8,7 @@
 # --------------------------------------------------------

 import math
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from dataclasses import dataclass
@@ -66,6 +67,30 @@ def input_conditioner(x: torch.Tensor, norm_mean: torch.Tensor, norm_std: torch.
    return (x - norm_mean) / norm_std


+def _bicubic_from_ndarray(
+    array: npt.NDArray[Any], *, size: tuple[int, int]
+) -> torch.Tensor:
+    """
+    Convert a 4D NHWC ndarray to NCHW, bicubic-resize to `size`, and divide by 255.
+    Suppresses PyTorch's non-writable NumPy warning because interpolate copies,
+    and the torch.from_numpy(array) view is discarded when this function returns.
+    """
+
+    with warnings.catch_warnings():
+        msg = "The given NumPy array is not writ.*"
+        # Regex stops at "writ": PyTorch versions differ on writable vs. writeable.
+        warnings.filterwarnings("ignore", message=msg, category=UserWarning)
+        tensor = torch.from_numpy(array)
+    assert tensor.ndim == 4, f"{tensor.ndim=}"
+    tensor = tensor.permute(0, 3, 1, 2)
+    return (
+        torch.nn.functional.interpolate(
+            tensor, size=size, mode="bicubic", align_corners=False, antialias=True
+        )
+        / 255.0
+    )
+
+
 def dynamic_preprocess(
    image,
    *,
@@ -90,36 +115,19 @@ def dynamic_preprocess(
        image.convert("RGB") if image.mode != "RGB" else image, dtype=np.uint8
    )

-    image = torch.from_numpy(image).unsqueeze(0)  # (1, H, W, 3)
-    image = image.permute(0, 3, 1, 2)  # (1, 3, H, W)
+    image = np.expand_dims(image, axis=0)

-    resized_img = torch.nn.functional.interpolate(
-        image,
-        size=(target_height, target_width),
-        mode="bicubic",
-        align_corners=False,
-        antialias=True,
-    )
+    resized_img = _bicubic_from_ndarray(image, size=(target_height, target_width))
    B, C, H, W = resized_img.shape
    hp, wp = H // image_size, W // image_size
    patches = (
        resized_img.reshape(B, C, hp, image_size, wp, image_size)
        .permute(0, 2, 4, 1, 3, 5)
        .reshape(B * hp * wp, C, image_size, image_size)
-        / 255.0
    )

    if use_thumbnail and patches.shape[0] > 1:
-        thumb = (
-            torch.nn.functional.interpolate(
-                image,
-                size=(image_size, image_size),
-                mode="bicubic",
-                align_corners=False,
-                antialias=True,
-            )
-            / 255.0
-        )
+        thumb = _bicubic_from_ndarray(image, size=(image_size, image_size))
        patches = torch.cat([patches, thumb], dim=0)

    return list(patches)
@@ -241,21 +249,9 @@ def video_to_pixel_values(
            downsample_ratio=downsample_ratio,
        )
        if video_tensor.shape[2] != target_h or video_tensor.shape[3] != target_w:
-            video_tensor = torch.nn.functional.interpolate(
-                video_tensor,
-                size=(target_h, target_w),
-                mode="bicubic",
-                align_corners=False,
-                antialias=True,
-            )
+            return _bicubic_from_ndarray(video, size=(target_h, target_w))
    elif video_tensor.shape[2] != input_size or video_tensor.shape[3] != input_size:
-        video_tensor = torch.nn.functional.interpolate(
-            video_tensor,
-            size=(input_size, input_size),
-            mode="bicubic",
-            align_corners=False,
-            antialias=True,
-        )
+        return _bicubic_from_ndarray(video, size=(input_size, input_size))

    video_tensor = video_tensor / 255.0

@@ -385,16 +381,8 @@ class DynamicResolutionImageTiler:
            params.media.convert("RGB") if params.media.mode != "RGB" else params.media,
            dtype=np.uint8,
        )
-        resized_img = (
-            torch.nn.functional.interpolate(
-                torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2),
-                size=target_size,
-                mode="bicubic",
-                align_corners=False,
-                antialias=True,
-            )
-            / 255.0
-        )
+        image = np.expand_dims(image, axis=0)
+        resized_img = _bicubic_from_ndarray(image, size=target_size)
        return list(resized_img)

    def process_media(