Adds anthropic /v1/messages endpoint to openai api_server (#27882)

Signed-off-by: bbartels <benjamin@bartels.dev>
Signed-off-by: Benjamin Bartels <benjamin@bartels.dev>
This commit is contained in:
Benjamin Bartels
2025-11-01 19:45:42 +00:00
committed by GitHub
parent c2ed069b32
commit 1e88fb751b
5 changed files with 139 additions and 462 deletions

View File

@@ -247,6 +247,23 @@ class RemoteOpenAIServer:
**kwargs,
)
def get_client_anthropic(self, **kwargs):
    """Return a synchronous Anthropic SDK client pointed at this server.

    A 600-second timeout is applied unless the caller supplies one;
    retries are disabled so test failures surface immediately.
    """
    kwargs.setdefault("timeout", 600)
    client = anthropic.Anthropic(
        base_url=self.url_for(),
        api_key=self.DUMMY_API_KEY,
        max_retries=0,
        **kwargs,
    )
    return client
def get_async_client_anthropic(self, **kwargs):
    """Return an asynchronous Anthropic SDK client pointed at this server.

    Mirrors `get_client_anthropic`: default 600-second timeout,
    retries disabled, dummy API key.
    """
    kwargs.setdefault("timeout", 600)
    client = anthropic.AsyncAnthropic(
        base_url=self.url_for(),
        api_key=self.DUMMY_API_KEY,
        max_retries=0,
        **kwargs,
    )
    return client
class RemoteOpenAIServerCustom(RemoteOpenAIServer):
"""Launch test server with custom child process"""
@@ -293,131 +310,6 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
self.proc.kill()
class RemoteAnthropicServer:
    """Test fixture that launches ``vllm.entrypoints.anthropic.api_server``
    in a subprocess and exposes Anthropic SDK clients pointed at it.

    Intended for use as a context manager: the server process is started in
    ``__init__`` (blocking until its ``/health`` endpoint responds) and
    terminated in ``__exit__``.
    """

    DUMMY_API_KEY = "token-abc123"  # vLLM's Anthropic server does not need API key

    def __init__(
        self,
        model: str,
        vllm_serve_args: list[str],
        *,
        env_dict: dict[str, str] | None = None,
        seed: int | None = 0,
        auto_port: bool = True,
        max_wait_seconds: float | None = None,
    ) -> None:
        """Start the server subprocess and wait for it to become healthy.

        Args:
            model: Model name or local model directory path.
            vllm_serve_args: Extra CLI args forwarded to the server.
            env_dict: Extra environment variables for the subprocess.
            seed: Seed appended as ``--seed`` unless ``None``.
            auto_port: Pick a free port automatically; conflicts with an
                explicit ``-p``/``--port`` in ``vllm_serve_args``.
            max_wait_seconds: Health-check timeout (default 240s).

        Raises:
            ValueError: If a port/seed is both auto-managed and supplied
                manually in ``vllm_serve_args``.
            RuntimeError: If the server exits or fails to become healthy
                within the timeout.
        """
        if auto_port:
            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
                raise ValueError(
                    "You have manually specified the port when `auto_port=True`."
                )

            # Don't mutate the input args
            vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
        if seed is not None:
            if "--seed" in vllm_serve_args:
                raise ValueError(
                    f"You have manually specified the seed when `seed={seed}`."
                )

            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]

        # Reuse the `vllm serve` subcommand's parser so CLI semantics match
        # the real entrypoint exactly.
        parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.")
        subparsers = parser.add_subparsers(required=False, dest="subparser")
        parser = ServeSubcommand().subparser_init(subparsers)
        args = parser.parse_args(["--model", model, *vllm_serve_args])
        self.host = str(args.host or "localhost")
        self.port = int(args.port)

        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None

        # download the model before starting the server to avoid timeout
        is_local = os.path.isdir(model)
        if not is_local:
            engine_args = AsyncEngineArgs.from_cli_args(args)
            model_config = engine_args.create_model_config()
            load_config = engine_args.create_load_config()
            model_loader = get_model_loader(load_config)
            model_loader.download_model(model_config)

        env = os.environ.copy()
        # the current process might initialize cuda,
        # to be safe, we should use spawn method
        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        if env_dict is not None:
            env.update(env_dict)
        self.proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.anthropic.api_server",
                model,
                *vllm_serve_args,
            ],
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        max_wait_seconds = max_wait_seconds or 240
        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Graceful terminate first; escalate to SIGKILL after 8 seconds.
        self.proc.terminate()
        try:
            self.proc.wait(8)
        except subprocess.TimeoutExpired:
            # force kill if needed
            self.proc.kill()

    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception:
                # this exception can only be raised by requests.get,
                # which means the server is not ready yet.
                # the stack trace is not useful, so we suppress it
                # by using `raise from None`.
                result = self.proc.poll()
                if result is not None and result != 0:
                    raise RuntimeError("Server exited unexpectedly.") from None

                time.sleep(0.5)
                if time.time() - start > timeout:
                    raise RuntimeError("Server failed to start in time.") from None

    @property
    def url_root(self) -> str:
        """Base URL of the running server, e.g. ``http://localhost:8000``."""
        return f"http://{self.host}:{self.port}"

    def url_for(self, *parts: str) -> str:
        """Join *parts* onto the server's base URL with ``/`` separators."""
        return self.url_root + "/" + "/".join(parts)

    def get_client(self, **kwargs):
        """Return a synchronous Anthropic SDK client for this server.

        Defaults to a 600-second timeout and disables retries.
        """
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.Anthropic(
            base_url=self.url_for(),
            api_key=self.DUMMY_API_KEY,
            max_retries=0,
            **kwargs,
        )

    def get_async_client(self, **kwargs):
        """Return an asynchronous Anthropic SDK client for this server.

        Defaults to a 600-second timeout and disables retries.
        """
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.AsyncAnthropic(
            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
        )
def _test_completion(
client: openai.OpenAI,
model: str,