diff --git a/requirements/common.txt b/requirements/common.txt
index a569df882..26d53f80a 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -42,7 +42,6 @@ depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
-scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
diff --git a/setup.py b/setup.py
index ea0dabfb0..8c952e01a 100644
--- a/setup.py
+++ b/setup.py
@@ -978,12 +978,13 @@ setup(
     ext_modules=ext_modules,
     install_requires=get_requirements(),
     extras_require={
-        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
+        "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
         "audio": [
             "librosa",
+            "scipy",
             "soundfile",
             "mistral_common[audio]",
         ],  # Required for audio processing
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 6e339d2ef..813725d6d 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -27,6 +27,12 @@ try:
 except ImportError:
     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
+
+try:
+    import scipy.signal as scipy_signal
+except ImportError:
+    scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal")  # type: ignore[assignment]
+
 # ============================================================
 
 
@@ -173,13 +179,10 @@ def resample_audio_scipy(
     orig_sr: float,
     target_sr: float,
 ):
-    # lazy import scipy.signal, otherwise it will crash doc build.
-    import scipy.signal
-
     if orig_sr > target_sr:
-        return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+        return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
     elif orig_sr < target_sr:
-        return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+        return scipy_signal.resample_poly(audio, target_sr // orig_sr, 1)
     return audio