[V1][Core] Support for Structured Outputs (#12388)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import concurrent
|
||||
@@ -8,6 +10,7 @@ import datetime
|
||||
import enum
|
||||
import gc
|
||||
import getpass
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
import importlib.util
|
||||
import inspect
|
||||
@@ -23,6 +26,7 @@ import tempfile
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import types
|
||||
import uuid
|
||||
import warnings
|
||||
import weakref
|
||||
@@ -982,7 +986,7 @@ def current_stream() -> torch.cuda.Stream:
|
||||
return _current_stream
|
||||
|
||||
|
||||
def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None:
|
||||
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
|
||||
"""Set up function tracing for the current thread,
|
||||
if enabled via the VLLM_TRACE_FUNCTION environment variable
|
||||
"""
|
||||
@@ -1977,7 +1981,7 @@ class MemorySnapshot:
|
||||
self.non_torch_memory = self.cuda_memory - self.torch_memory
|
||||
self.timestamp = time.time()
|
||||
|
||||
def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
|
||||
def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
|
||||
return MemorySnapshot(
|
||||
torch_peak=self.torch_peak - other.torch_peak,
|
||||
cuda_memory=self.cuda_memory - other.cuda_memory,
|
||||
@@ -2306,3 +2310,54 @@ def warn_for_unimplemented_methods(cls: type[T]) -> type[T]:
|
||||
|
||||
type.__setattr__(cls, '__init__', wrapped_init)
|
||||
return cls
|
||||
|
||||
|
||||
class LazyLoader(types.ModuleType):
    """Module proxy that defers the real import until first use.

    Borrowed from TensorFlow's lazy loader
    (https://github.com/tensorflow/tensorflow/blob/main/tensorflow/python/util/lazy_loader.py)
    with an addition of "module caching".

    Lazily import a module, mainly to avoid pulling in large dependencies.
    Modules such as `xgrammar` might do additional side effects, so we
    only want to use this when it is needed, delaying all eager effects.
    """

    def __init__(
        self,
        local_name: str,
        parent_module_globals: dict[str, Any],
        name: str,
    ):
        # Name under which the real module will be installed in the
        # parent's namespace once it is actually imported.
        self._local_name = local_name
        self._parent_module_globals = parent_module_globals
        # Cached real module; stays None until the first attribute access.
        self._module: types.ModuleType | None = None

        super().__init__(str(name))

    def _load(self) -> types.ModuleType:
        """Import the target module and splice it into the parent scope."""
        try:
            loaded = importlib.import_module(self.__name__)
        except ModuleNotFoundError as err:
            # Suppress the exception context deliberately.
            raise err from None

        # Insert into the parent's namespace, and also register in
        # sys.modules: the additional add ensures the library is
        # actually loaded.
        self._parent_module_globals[self._local_name] = loaded
        sys.modules[self._local_name] = loaded

        # Copy the module's attributes onto this proxy so that anyone
        # holding a reference to the LazyLoader gets fast lookups —
        # __getattr__ only fires on lookups that fail.
        self.__dict__.update(loaded.__dict__)
        return loaded

    def _resolved(self) -> types.ModuleType:
        # Load on first use, then reuse the cached module.
        if self._module is None:
            self._module = self._load()
        return self._module

    def __getattr__(self, item: Any) -> Any:
        return getattr(self._resolved(), item)

    def __dir__(self) -> list[str]:
        return dir(self._resolved())
|
||||
|
||||
Reference in New Issue
Block a user