[Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Authored by Wallas Henrique on 2024-11-25 14:23:32 -03:00; committed by GitHub.
parent d04b13a380
commit c27df94e1f
6 changed files with 122 additions and 13 deletions

View File

@@ -2388,6 +2388,16 @@ class VllmConfig:
self.quant_config = VllmConfig._get_quantization_config(
self.model_config, self.load_config)
if self.scheduler_config is not None and \
self.model_config is not None and \
self.scheduler_config.chunked_prefill_enabled and \
self.model_config.dtype == torch.float32 and \
current_platform.get_device_capability() == (7, 5):
print_warning_once(
"Turing devices tensor cores do not support float32 matmul. "
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels.")
if self.compilation_config is None:
self.compilation_config = CompilationConfig()
if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: