diff --git a/docker/Dockerfile b/docker/Dockerfile
index a0915f9a7..a2fe0971d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -538,7 +538,9 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
         cuda-nvrtc-${CUDA_VERSION_DASH} \
         cuda-cuobjdump-${CUDA_VERSION_DASH} \
         libcurand-dev-${CUDA_VERSION_DASH} \
-        libcublas-${CUDA_VERSION_DASH} && \
+        libcublas-${CUDA_VERSION_DASH} \
+        # Required by fastsafetensors (fixes #20384)
+        libnuma-dev && \
     # Fixes nccl_allocator requiring nccl.h at runtime
     # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
     # NCCL packages don't use the cuda-MAJOR-MINOR naming convention,
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index fad79af9c..218356367 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -19,7 +19,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
-    apt-transport-https ca-certificates wget curl
+    apt-transport-https ca-certificates wget curl \
+    libnuma-dev
 RUN python3 -m pip install --upgrade pip
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 75831c39e..c6b8a82ea 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -15,6 +15,9 @@ flashinfer-cubin==0.6.7
 # breaking changes in 1.19.0
 nvidia-cudnn-frontend>=1.13.0,<1.19.0
 
+# Required for faster safetensors model loading
+fastsafetensors >= 0.2.2
+
 # QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
 nvidia-cutlass-dsl>=4.4.2
 quack-kernels>=0.3.3
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index a441bfef0..ddea10f5d 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -276,7 +276,9 @@ fastar==0.9.0
 fastparquet==2026.3.0
     # via genai-perf
 fastsafetensors==0.2.2
-    # via -r requirements/rocm-test.in
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
 filelock==3.25.2
     # via
     #   -c requirements/common.txt
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index 6639e71a4..011d0e53f 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -20,4 +20,6 @@ conch-triton-kernels==1.2.1
 timm>=1.0.17
 # amd-quark: required for Quark quantization on ROCm
 # To be consistent with test_quark.py
-amd-quark>=0.8.99
\ No newline at end of file
+amd-quark>=0.8.99
+# Required for faster safetensors model loading
+fastsafetensors >= 0.2.2