[Core] Remove prompt string from engine core data structures (#17214)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-04-25 23:41:05 -07:00
committed by GitHub
parent 513f074766
commit df6f3ce883
21 changed files with 40 additions and 76 deletions

View File

@@ -50,7 +50,6 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
# Make N requests.
requests = [
EngineCoreRequest(request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -64,14 +63,13 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
output_kind=request_output_kind,
stop=[],
include_stop_str_in_output=False,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
gen_strings = {}
gen_tokens = {}
@@ -398,7 +396,6 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
]
requests = [
EngineCoreRequest(request_id=request_id_list[idx],
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -414,14 +411,13 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
include_stop_str_in_output=False,
logprobs=num_sample_logprobs,
prompt_logprobs=num_prompt_logprobs,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
gen_tokens = {}
gen_logprobs = {}
@@ -562,7 +558,6 @@ def test_stop_token(include_stop_str_in_output: bool,
request_id = "request-0"
request = EngineCoreRequest(
request_id=request_id,
prompt=prompt_string,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -583,7 +578,7 @@ def test_stop_token(include_stop_str_in_output: bool,
))
# Add request to the detokenizer.
output_processor.add_request(request)
output_processor.add_request(request, prompt_string)
# Loop over engine core steps; run output processor
gen_string = ""
@@ -659,7 +654,6 @@ def test_stop_string(include_stop_str_in_output: bool,
requests = [
EngineCoreRequest(
request_id=request_id_list[idx],
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -675,14 +669,13 @@ def test_stop_string(include_stop_str_in_output: bool,
include_stop_str_in_output=include_stop_str_in_output,
logprobs=num_sample_logprobs,
prompt_logprobs=None,
)) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
))
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
# Add requests to the detokenizer.
for request in requests:
output_processor.add_request(request)
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt)
gen_strings = {}
gen_tokens = {}
@@ -774,7 +767,6 @@ def test_iteration_stats(dummy_test_vectors):
requests = [
EngineCoreRequest(
request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
@@ -783,15 +775,13 @@ def test_iteration_stats(dummy_test_vectors):
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(),
) for idx, (prompt, prompt_tokens) in enumerate(
zip(dummy_test_vectors.prompt_strings,
dummy_test_vectors.prompt_tokens))
) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
]
# Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]:
output_processor.add_request(request)
output_processor.add_request(request, None)
inactive_request = requests[num_active]
# First iteration has 2 prefills.
@@ -817,7 +807,7 @@ def test_iteration_stats(dummy_test_vectors):
assert iteration_stats.num_generation_tokens == num_active
# Add a new request - prefill and 2 decodes in this step.
output_processor.add_request(inactive_request)
output_processor.add_request(inactive_request, None)
num_active += 1
outputs = engine_core.get_outputs()[:num_active]
iteration_stats = IterationStats()