[Feature][V1]: supports cached_tokens in response usage (#18149)
Co-authored-by: simon-mo <xmo@berkeley.edu>
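The change surfaces the prefix-cache hit count to API clients. As a minimal sketch, assuming the OpenAI-compatible server populates the OpenAI-style prompt_tokens_details.cached_tokens field for this change and that a model is already being served (the model name below is hypothetical):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="facebook/opt-125m",  # hypothetical served model name
    messages=[{"role": "user", "content": "Hello"}],
)

# prompt_tokens_details.cached_tokens mirrors the OpenAI usage schema;
# treat its presence here as an assumption about this change, not a confirmed contract.
details = resp.usage.prompt_tokens_details
print(details.cached_tokens if details else None)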
@@ -19,7 +19,8 @@ def model() -> LLM:
         enable_prefix_caching=True,
         long_prefill_token_threshold=2,
         max_num_batched_tokens=6,
-        max_num_seqs=3)
+        max_num_seqs=3,
+        block_size=16)


 def test_concurrent_partial_prefill(model):
@@ -27,3 +28,11 @@ def test_concurrent_partial_prefill(model):
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded(model):
+    # 17 tokens will make sure first 16 tokens are cached in a block
+    input_tokens = {"prompt_token_ids": [101] * 17}
+    _ = model.generate([input_tokens])
+    outputs = model.generate([input_tokens])
+    assert outputs[0].num_cached_tokens == 16
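The new test leans on the block-alignment rule noted in its comment: only whole blocks are reused from the prefix cache, so a 17-token prompt with block_size=16 yields exactly one cached 16-token block on the second request. A minimal sketch of that arithmetic follows; the helper name is made up for illustration and is not part of vLLM's API:

def expected_cached_tokens(prompt_len: int, block_size: int = 16) -> int:
    # Only full blocks count toward the cached-token total.
    return (prompt_len // block_size) * block_size

assert expected_cached_tokens(17) == 16   # the case exercised by the test
assert expected_cached_tokens(40) == 32   # two full blocks; the partial third is ignored
assert expected_cached_tokens(15) == 0    # no full block, so nothing is reused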