[Bugfix] Fix incorrect updates to num_computed_tokens in multi-step scheduling (#9038)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-10-06 15:48:11 -04:00
parent fdf59d30ea
commit cb3b2b9ba4
6 changed files with 179 additions and 110 deletions
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -16,6 +16,8 @@ def create_dummy_prompt(
    use_beam_search: bool = False,
    best_of: int = 1,
    prompt_tokens: Optional[List[int]] = None,
+    min_tokens: int = 0,
+    max_tokens: int = 16,
 ) -> Tuple[Sequence, SequenceGroup]:
    if not block_size:
        block_size = prompt_length
@@ -36,7 +38,9 @@ def create_dummy_prompt(
                              arrival_time=time.time(),
                              sampling_params=SamplingParams(
                                  use_beam_search=use_beam_search,
-                                  best_of=best_of),
+                                  best_of=best_of,
+                                  max_tokens=max_tokens,
+                                  min_tokens=min_tokens),
                              lora_request=lora_request)

    return prompt, seq_group