[ Kernel ] Enable Dynamic Per Token fp8 (#6547)

2024-07-19 19:08:15 -04:00
parent 07eb6f19f3
commit 4cc24f01b1
7 changed files with 67 additions and 38 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -214,7 +214,8 @@ class Fp8LinearMethod(LinearMethodBase):
            weight_scale=layer.weight_scale,
            input_scale=layer.input_scale,
            bias=bias,
-            cutlass_fp8_supported=self.cutlass_fp8_supported)
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=False)


 class Fp8MoEMethod(FusedMoEMethodBase):