Add Automatic Prefix Caching (#2762)

Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
Sage Moore
2024-03-02 03:50:01 -05:00
committed by GitHub
parent baee28c46c
commit ce4f5a29fb
18 changed files with 615 additions and 289 deletions

View File

@@ -39,15 +39,11 @@ async def generate(request: Request) -> Response:
"""
request_dict = await request.json()
prompt = request_dict.pop("prompt")
prefix_pos = request_dict.pop("prefix_pos", None)
stream = request_dict.pop("stream", False)
sampling_params = SamplingParams(**request_dict)
request_id = random_uuid()
results_generator = engine.generate(prompt,
sampling_params,
request_id,
prefix_pos=prefix_pos)
results_generator = engine.generate(prompt, sampling_params, request_id)
# Streaming case
async def stream_results() -> AsyncGenerator[bytes, None]: