[V1] LogitsProcessor programming model (#16728)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-07-02 12:10:42 -04:00
parent c1909e7e8c
commit 48fb076cbc
13 changed files with 1401 additions and 393 deletions
--- a/tests/v1/sample/test_logprobs_e2e.py
+++ b/tests/v1/sample/test_logprobs_e2e.py
@@ -13,9 +13,10 @@ EXPECTED_VALUE = 0.62

 # FIXME(rob): enable prefix caching once supported.
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
-MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False"  # noqa: E501
+MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8"  # noqa: E501
 SERVER_ARGS = [
-    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
+    "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests",
+    "--gpu-memory-utilization=0.8"
 ]
 NUM_CONCURRENT = 100