[Core] Enable decode of context length equal to max model length (#26168)

Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
2025-10-04 11:59:26 +02:00
parent d0df145c2a
commit f05fea1f5e
4 changed files with 32 additions and 15 deletions
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -82,10 +82,11 @@ def test_max_model_len():
    for output in outputs:
        num_total_tokens = len(output.prompt_token_ids) + len(
            output.outputs[0].token_ids)
-        # Total tokens must not exceed max_model_len.
+        # Total tokens must not exceed max_model_len + 1: the last token can be
+        # generated when the context length equals the max model length.
        # It can be less if generation finishes due to other reasons (e.g., EOS)
        # before reaching the absolute model length limit.
-        assert num_total_tokens <= max_model_len
+        assert num_total_tokens <= max_model_len + 1


 def test_log_stats():