[Bugfix] Fix for Spec model TP + Chunked Prefill (#10232)

Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> Signed-off-by: Sourashis Roy <sroy@roblox.com> Co-authored-by: Sourashis Roy <sroy@roblox.com>
2024-11-26 09:11:16 -08:00
parent 1f6584ee85
commit db66e018ea
8 changed files with 144 additions and 72 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1409,16 +1409,6 @@ class SpeculativeConfig:
                    draft_hf_config
            )

-            if (enable_chunked_prefill and \
-                 speculative_draft_tensor_parallel_size != 1):
-                # TODO - Investigate why the error reported in
-                # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258
-                # is happening and re-enable it.
-                raise ValueError(
-                    "Chunked prefill and speculative decoding can be enabled "
-                    "simultaneously only for draft models with tensor "
-                    "parallel size 1.")
-
            draft_model_config.max_model_len = (
                SpeculativeConfig._maybe_override_draft_max_model_len(
                    speculative_max_model_len,