[Kernel] Triton-based Top-k and Top-p sampler kernels (#33538)

Signed-off-by: js_park <cakeng@naver.com>
Signed-off-by: Jongseok Park <37990712+cakeng@users.noreply.github.com>
Signed-off-by: Sunga Kim <sunga.kim@berkeley.edu>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Sunga Kim <sunga.kim@berkeley.edu>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Jongseok Park
2026-02-17 15:14:30 -08:00
committed by GitHub
parent dc5fa77a4e
commit c656ba3b4d
6 changed files with 2002 additions and 32 deletions

View File

@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
model=MODEL_NAME,
max_tokens=10000,
extra_body={"min_tokens": 10000},
+temperature=0.0,
)
)
tasks.append(task)
@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
# be able to respond to this one within the timeout
client = server.get_async_client(timeout=5)
response = await client.chat.completions.create(
-messages=chat_input, model=MODEL_NAME, max_tokens=10
+messages=chat_input, model=MODEL_NAME, max_tokens=10, temperature=0.0
)
assert len(response.choices) == 1