[Feature][V1]: supports cached_tokens in response usage (#18149)
Co-authored-by: simon-mo <xmo@berkeley.edu>
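The change surfaces the prefix-cache hit count to API clients. As a minimal sketch, assuming the OpenAI-compatible server populates the OpenAI-style prompt_tokens_details.cached_tokens field for this change and that a model is already being served (the model name below is hypothetical):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="facebook/opt-125m",  # hypothetical served model name
    messages=[{"role": "user", "content": "Hello"}],
)

# prompt_tokens_details.cached_tokens mirrors the OpenAI usage schema;
# treat its presence here as an assumption about this change, not a confirmed contract.
details = resp.usage.prompt_tokens_details
print(details.cached_tokens if details else None)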
@@ -19,7 +19,8 @@ def model() -> LLM:
         enable_prefix_caching=True,
         long_prefill_token_threshold=2,
         max_num_batched_tokens=6,
-        max_num_seqs=3)
+        max_num_seqs=3,
+        block_size=16)


 def test_concurrent_partial_prefill(model):
@@ -27,3 +28,11 @@ def test_concurrent_partial_prefill(model):
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded(model):
+    # 17 tokens will make sure first 16 tokens are cached in a block
+    input_tokens = {"prompt_token_ids": [101] * 17}
+    _ = model.generate([input_tokens])
+    outputs = model.generate([input_tokens])
+    assert outputs[0].num_cached_tokens == 16
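The new test leans on the block-alignment rule noted in its comment: only whole blocks are reused from the prefix cache, so a 17-token prompt with block_size=16 yields exactly one cached 16-token block on the second request. A minimal sketch of that arithmetic follows; the helper name is made up for illustration and is not part of vLLM's API:

def expected_cached_tokens(prompt_len: int, block_size: int = 16) -> int:
    # Only full blocks count toward the cached-token total.
    return (prompt_len // block_size) * block_size

assert expected_cached_tokens(17) == 16   # the case exercised by the test
assert expected_cached_tokens(40) == 32   # two full blocks; the partial third is ignored
assert expected_cached_tokens(15) == 0    # no full block, so nothing is reused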