diff --git a/docker/Dockerfile b/docker/Dockerfile
index 61ebf970f..964700e2a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -132,9 +132,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
@@ -356,16 +354,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --system dist/*.whl --verbose \
+    uv pip install --system dist/*.whl --verbose \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.4.1 \
-    && uv pip install --system flashinfer-jit-cache==0.4.1 \
+    uv pip install --system flashinfer-cubin==0.5.2 \
+    && uv pip install --system flashinfer-jit-cache==0.5.2 \
     --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 6dfa56017..b88b9c499 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.

 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.1
+# release version: v0.5.2
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.5.2 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 5f7d520cd..4e393d6b6 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -12,4 +12,4 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor
 # Build from https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.33+5d4b92a5.d20251029; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.4.1
+flashinfer-python==0.5.2
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 79981009c..693b849eb 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -238,9 +238,11 @@ def test_flashinfer_trtllm_decode_with_baseline(
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
         rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 2e-2, 4e-2
+        rtol, atol = 3e-2, 4e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
-        rtol, atol = 1e-2, 2e-2
+        rtol, atol = 2e-2, 2e-2
+    elif kv_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2

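Not part of the patch: a minimal shell sketch for sanity-checking the bumped FlashInfer pins outside of the Docker build. The package versions and the final "flashinfer show-config" call mirror the Dockerfile change above; the cu129 index suffix, the throwaway virtual environment, and having uv on PATH are assumptions for illustration, not something this change prescribes.

    # Assumed: uv is installed and the host has a CUDA 12.9 toolchain (hence cu129); adjust the suffix to the local CUDA version.
    uv venv /tmp/flashinfer-check && source /tmp/flashinfer-check/bin/activate
    uv pip install flashinfer-python==0.5.2 flashinfer-cubin==0.5.2 flashinfer-jit-cache==0.5.2 \
        --extra-index-url https://flashinfer.ai/whl/cu129
    # Prints the resolved FlashInfer configuration, as done at the end of the Dockerfile layer.
    flashinfer show-config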