[Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
@@ -1055,6 +1055,7 @@ class EngineArgs:
|
||||
msg = "Chunked prefill is not supported for embedding models"
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
speculative_config = SpeculativeConfig.maybe_create_spec_config(
|
||||
target_model_config=model_config,
|
||||
target_parallel_config=parallel_config,
|
||||
|
||||
Reference in New Issue
Block a user