[Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (#5516)

2024-06-14 12:30:15 -04:00
parent d47af2bc02
commit 703475f6c2
2 changed files with 2 additions and 4 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,9 +257,7 @@ class Fp8LinearMethod(LinearMethodBase):
        #   If dynamic, layer.input_scale is None and x_scale computed from x.
        #   If static, layer.input_scale is scalar and x_scale is input_scale.

-        # Temporarily disable CUTLASS kernels due to an illegal memory access
-        #if  bias is None and self.cutlass_fp8_supported:
-        if False:
+        if bias is None and self.cutlass_fp8_supported:
            qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)

            # Fused GEMM_DQ