[V1][Core] Support for Structured Outputs (#12388)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Aaron Pham
2025-03-07 10:19:11 -05:00
committed by GitHub
parent 1e3598edeb
commit 80e9afb5bc
26 changed files with 1528 additions and 715 deletions

View File

@@ -1,12 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
from typing import Optional
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
EOS_TOKEN_ID = 50256
@@ -36,13 +37,21 @@ def create_scheduler(
swap_space=0,
cache_dtype="auto",
)
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
cache_config=cache_config,
)
cache_config.num_gpu_blocks = 10000
return Scheduler(scheduler_config,
model_config,
cache_config,
speculative_config=None,
lora_config=None,
log_stats=True)
return Scheduler(
scheduler_config,
model_config,
cache_config,
speculative_config=None,
lora_config=None,
log_stats=True,
structured_output_manager=StructuredOutputManager(vllm_config),
)
def create_requests(
@@ -249,7 +258,9 @@ def test_stop_via_update_from_output():
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[])
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
@@ -299,7 +310,9 @@ def test_stop_via_update_from_output():
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[])
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
@@ -347,7 +360,9 @@ def test_stop_via_update_from_output():
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[])
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
@@ -392,7 +407,9 @@ def test_stop_via_update_from_output():
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[])
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],