From 77d24c4bfedc9812a703b865cf5935f4e941660f Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Sun, 22 Mar 2026 08:57:20 -0400
Subject: [PATCH] [Bug] Fix fp8 deepgemm batch invariant (#37718)

Signed-off-by: yewentao256
---
 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 78b123402..a974e2c57 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -305,6 +305,11 @@ def _flashinfer_fp8_blockscale_gemm_impl(
         )
         return output
 
+    from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+
+    if vllm_is_batch_invariant():
+        return run_deepgemm(input, weight, weight_scale)
+
     condition = input.shape[0] < 32
 
     # PyTorch's torch.compile cannot handle input-dependent control flow in standard