[feat] enable SM100 CUTLASS block scaled group gemm for smaller batch sizes (#20640)

Signed-off-by: Duncan Moss <djm.moss@gmail.com>
2025-07-08 20:03:35 -07:00
parent 34dad19e7b
commit 97abeb1daa
2 changed files with 5 additions and 7 deletions
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1180,7 +1180,7 @@ def fused_experts(
            apply_router_weight_on_input=apply_router_weight_on_input,
        )
    elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8
-          and _valid_cutlass_block_scaled_grouped_gemm(hidden_states, w1, w2)):
+          and _valid_cutlass_block_scaled_grouped_gemm(w1, w2)):
        assert apply_router_weight_on_input is False
        return run_cutlass_block_scaled_fused_experts(
            a=hidden_states,