[Frontend] Gracefully handle missing chat template and fix CI failure (#7238)
Co-authored-by: Roger Wang <ywang@roblox.com>
@@ -9,6 +9,11 @@ from vllm.model_executor.models.opt import OPTForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.utils import get_open_port
 
+from ...utils import VLLM_PATH, RemoteOpenAIServer
+
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
+
 
 class MyOPTForCausalLM(OPTForCausalLM):
 
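Note: the tokenizer for facebook/opt-125m bundles no chat template of its own, so the test now points the server at the repository's ChatML example template explicitly. Below is a minimal sketch of how a ChatML-style Jinja template turns an OpenAI-style message list into a prompt string; the template text is illustrative only, not the actual contents of examples/template_chatml.jinja.

# Illustrative ChatML-style template, rendered with plain jinja2.
from jinja2 import Template

CHATML_DEMO = Template(
    "{% for m in messages %}"
    "<|im_start|>{{ m['role'] }}\n{{ m['content'] }}<|im_end|>\n"
    "{% endfor %}"
    "<|im_start|>assistant\n")

prompt = CHATML_DEMO.render(messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
])
print(prompt)
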
@@ -21,12 +26,25 @@ class MyOPTForCausalLM(OPTForCausalLM):
         return logits
 
 
-def server_function(port):
+def server_function(port: int):
     # register our dummy model
     ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
-    sys.argv = ["placeholder.py"] + \
-        ("--model facebook/opt-125m --gpu-memory-utilization 0.10 "
-         f"--dtype float32 --api-key token-abc123 --port {port}").split()
+    sys.argv = ["placeholder.py"] + [
+        "--model",
+        "facebook/opt-125m",
+        "--gpu-memory-utilization",
+        "0.10",
+        "--dtype",
+        "float32",
+        "--api-key",
+        "token-abc123",
+        "--port",
+        str(port),
+        "--chat-template",
+        str(chatml_jinja_path),
+    ]
     import runpy
     runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
 
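Note: building sys.argv as an explicit list rather than splitting a flat string keeps every value intact even when it contains spaces, which matters once a filesystem path like str(chatml_jinja_path) is among the arguments. A small standalone demonstration of the failure mode (the path below is hypothetical):

# .split() breaks any argument that itself contains a space.
template = "/tmp/my templates/chatml.jinja"

naive = f"--chat-template {template}".split()
# -> ['--chat-template', '/tmp/my', 'templates/chatml.jinja']  (mangled)

explicit = ["--chat-template", str(template)]
# -> ['--chat-template', '/tmp/my templates/chatml.jinja']     (intact)
print(naive, explicit, sep="\n")
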
@@ -36,35 +54,40 @@ def test_oot_registration_for_api_server():
     ctx = torch.multiprocessing.get_context()
     server = ctx.Process(target=server_function, args=(port, ))
     server.start()
-    MAX_SERVER_START_WAIT_S = 60
-    client = OpenAI(
-        base_url=f"http://localhost:{port}/v1",
-        api_key="token-abc123",
-    )
-    now = time.time()
-    while True:
-        try:
-            completion = client.chat.completions.create(
-                model="facebook/opt-125m",
-                messages=[{
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                }, {
-                    "role": "user",
-                    "content": "Hello!"
-                }],
-                temperature=0,
-            )
-            break
-        except OpenAIError as e:
-            if "Connection error" in str(e):
-                time.sleep(3)
-                if time.time() - now > MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError("Server did not start in time") from e
-            else:
-                raise e
-    server.kill()
+    try:
+        client = OpenAI(
+            base_url=f"http://localhost:{port}/v1",
+            api_key="token-abc123",
+        )
+        now = time.time()
+        while True:
+            try:
+                completion = client.chat.completions.create(
+                    model="facebook/opt-125m",
+                    messages=[{
+                        "role": "system",
+                        "content": "You are a helpful assistant."
+                    }, {
+                        "role": "user",
+                        "content": "Hello!"
+                    }],
+                    temperature=0,
+                )
+                break
+            except OpenAIError as e:
+                if "Connection error" in str(e):
+                    time.sleep(3)
+                    if time.time() - now > RemoteOpenAIServer.MAX_START_WAIT_S:
+                        msg = "Server did not start in time"
+                        raise RuntimeError(msg) from e
+                else:
+                    raise e
+    finally:
+        server.terminate()
 
     generated_text = completion.choices[0].message.content
     assert generated_text is not None
     # make sure only the first token is generated
     rest = generated_text.replace("<s>", "")
     assert rest == ""
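Note: the polling loop now lives inside try/finally, so the server subprocess is terminated even when the client never manages to connect, and the timeout is read from the shared RemoteOpenAIServer.MAX_START_WAIT_S constant instead of a local literal. A generic sketch of the cleanup pattern, assuming a hypothetical _fake_server stand-in rather than the real API server:

import multiprocessing
import time


def _fake_server() -> None:
    time.sleep(3600)  # stand-in for a long-running server process


def main() -> None:
    proc = multiprocessing.Process(target=_fake_server)
    proc.start()
    try:
        # Stand-in for the readiness poll: anything raised here
        # (timeout, connection error, failed assertion) still
        # reaches the finally block.
        time.sleep(0.5)
    finally:
        proc.terminate()  # SIGTERM; gentler than kill()'s SIGKILL
        proc.join()
    assert not proc.is_alive()


if __name__ == "__main__":
    main()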