[Kernel] Triton-based Top-k and Top-p sampler kernels (#33538)
Signed-off-by: js_park <cakeng@naver.com>
Signed-off-by: Jongseok Park <37990712+cakeng@users.noreply.github.com>
Signed-off-by: Sunga Kim <sunga.kim@berkeley.edu>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Sunga Kim <sunga.kim@berkeley.edu>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
|
||||
model=MODEL_NAME,
|
||||
max_tokens=10000,
|
||||
extra_body={"min_tokens": 10000},
|
||||
temperature=0.0,
|
||||
)
|
||||
)
|
||||
tasks.append(task)
|
||||
@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
|
||||
# be able to respond to this one within the timeout
|
||||
client = server.get_async_client(timeout=5)
|
||||
response = await client.chat.completions.create(
|
||||
-messages=chat_input, model=MODEL_NAME, max_tokens=10
|
||||
+messages=chat_input, model=MODEL_NAME, max_tokens=10, temperature=0.0
|
||||
)
|
||||
|
||||
assert len(response.choices) == 1
|
||||
|
||||
Reference in New Issue
Block a user