[Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
Wallas Henrique
2024-11-25 14:23:32 -03:00
committed by GitHub
parent d04b13a380
commit c27df94e1f
6 changed files with 122 additions and 13 deletions

View File

@@ -1055,6 +1055,7 @@ class EngineArgs:
msg = "Chunked prefill is not supported for embedding models"
raise ValueError(msg)
speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config,
target_parallel_config=parallel_config,