[Kernel] Triton-based Top-k and Top-p sampler kernels (#33538)

Signed-off-by: js_park <cakeng@naver.com>
Signed-off-by: Jongseok Park <37990712+cakeng@users.noreply.github.com>
Signed-off-by: Sunga Kim <sunga.kim@berkeley.edu>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Sunga Kim <sunga.kim@berkeley.edu>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-02-17 15:14:30 -08:00
parent dc5fa77a4e
commit c656ba3b4d
6 changed files with 2002 additions and 32 deletions
--- a/tests/entrypoints/instrumentator/test_basic.py
+++ b/tests/entrypoints/instrumentator/test_basic.py
@@ -145,6 +145,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
+                temperature=0.0,
            )
        )
        tasks.append(task)
@@ -163,7 +164,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
    # be able to respond to this one within the timeout
    client = server.get_async_client(timeout=5)
    response = await client.chat.completions.create(
-        messages=chat_input, model=MODEL_NAME, max_tokens=10
+        messages=chat_input, model=MODEL_NAME, max_tokens=10, temperature=0.0
    )

    assert len(response.choices) == 1