[Core][5/N] Fully working chunked prefill e2e (#3884)

This commit is contained in:
SangBin Cho
2024-04-11 09:56:48 +09:00
committed by GitHub
parent 63e7176f26
commit 67b4221a61
26 changed files with 927 additions and 315 deletions

View File

@@ -33,11 +33,16 @@ def test_models(
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
vllm_model = vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model