[V1] LogitsProcessor programming model (#16728)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
afeldman-nm
2025-07-02 12:10:42 -04:00
committed by GitHub
parent c1909e7e8c
commit 48fb076cbc
13 changed files with 1401 additions and 393 deletions

View File

@@ -13,9 +13,10 @@ EXPECTED_VALUE = 0.62
# FIXME(rob): enable prefix caching once supported.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501
MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False,gpu_memory_utilization=0.8" # noqa: E501
SERVER_ARGS = [
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests"
"--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests",
"--gpu-memory-utilization=0.8"
]
NUM_CONCURRENT = 100