[Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Authored by Wallas Henrique on 2024-11-25 14:23:32 -03:00; committed by GitHub.
parent d04b13a380
commit c27df94e1f
6 changed files with 122 additions and 13 deletions

View File

@@ -2388,6 +2388,16 @@ class VllmConfig:
self.quant_config = VllmConfig._get_quantization_config(
self.model_config, self.load_config)
if self.scheduler_config is not None and \
self.model_config is not None and \
self.scheduler_config.chunked_prefill_enabled and \
self.model_config.dtype == torch.float32 and \
current_platform.get_device_capability() == (7, 5):
print_warning_once(
"Turing devices tensor cores do not support float32 matmul. "
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels.")
if self.compilation_config is None:
self.compilation_config = CompilationConfig()
if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: