[Deprecation] Remove prompt_token_ids arg fallback in LLM.generate and LLM.embed (#18800)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-22 10:56:57 +08:00
parent 19fe1a0510
commit 8896eb72eb
24 changed files with 116 additions and 467 deletions
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -127,13 +127,15 @@ def test_structured_output(
        temperature=1.0,
        max_tokens=4096,
        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-    outputs = llm.generate(prompts=[
-        (f"Give an example JSON for an employee profile that fits this "
-         f"schema. Make the response as short as possible. Schema: "
-         f"{sample_json_schema}")
-    ] * 2,
-                           sampling_params=sampling_params,
-                           use_tqdm=True)
+
+    prompt = ("Give an example JSON for an employee profile that fits this "
+              "schema. Make the response as short as possible. Schema: "
+              f"{sample_json_schema}")
+    outputs = llm.generate(
+        [prompt] * 2,
+        sampling_params=sampling_params,
+        use_tqdm=True,
+    )

    assert outputs is not None

@@ -191,20 +193,24 @@ def test_structured_output(
        with pytest.raises(ValueError,
                           match="The provided JSON schema contains features "
                           "not supported by xgrammar."):
+
+            prompt = (f"Give an example JSON for an employee profile that "
+                      f"fits this schema: {unsupported_json_schema}. "
+                      f"Make the response as short as possible.")
            llm.generate(
-                prompts=[(f"Give an example JSON for an employee profile that "
-                          f"fits this schema: {unsupported_json_schema}. "
-                          f"Make the response as short as possible.")] * 2,
+                [prompt] * 2,
                sampling_params=sampling_params,
-                use_tqdm=True)
+                use_tqdm=True,
+            )
    else:
-        outputs = llm.generate(prompts=(
-            "Give an example JSON object for a grade "
-            "that fits this schema: "
-            f"{unsupported_json_schema}. Make the response as short as "
-            "possible."),
-                               sampling_params=sampling_params,
-                               use_tqdm=True)
+        prompt = (f"Give an example JSON object for a grade that "
+                  f"fits this schema: {unsupported_json_schema}. "
+                  f"Make the response as short as possible.")
+        outputs = llm.generate(
+            prompt,
+            sampling_params=sampling_params,
+            use_tqdm=True,
+        )
        assert outputs is not None
        for output in outputs:
            assert output is not None
@@ -227,10 +233,9 @@ def test_structured_output(
            max_tokens=1000,
            guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
        outputs = llm.generate(
-            prompts=(
-                "Generate a sql statement that selects col_1 from "
-                "table_1 where it is equal to 1. Make the response as short as "
-                "possible."),
+            ("Generate a sql statement that selects col_1 from "
+             "table_1 where it is equal to 1. Make the response as short as "
+             "possible."),
            sampling_params=sampling_params,
            use_tqdm=True,
        )
@@ -261,10 +266,9 @@ def test_structured_output(
            max_tokens=1000,
            guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
        outputs = llm.generate(
-            prompts=(
-                "Generate a sql statement that selects col_1 from "
-                "table_1 where it is equal to 1. Make the response as short as "
-                "possible."),
+            ("Generate a sql statement that selects col_1 from "
+             "table_1 where it is equal to 1. Make the response as short as "
+             "possible."),
            sampling_params=sampling_params,
            use_tqdm=True,
        )
@@ -301,7 +305,6 @@ def test_structured_output(
            guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
        with pytest.raises(ValueError, match="Failed to convert the grammar "):
            llm.generate(
-                prompts=
                ("Generate a sql statement that selects col_1 from "
                 "table_1 where it is equal to 1. Make the response as short "
                 "as possible."),
@@ -316,11 +319,11 @@ def test_structured_output(
        temperature=0.8,
        top_p=0.95,
        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+
+    prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
+              f"Make the response as short as possible.")
    outputs = llm.generate(
-        prompts=[
-            (f"Give an example IPv4 address with this regex: {sample_regex}. "
-             f"Make the response as short as possible.")
-        ] * 2,
+        [prompt] * 2,
        sampling_params=sampling_params,
        use_tqdm=True,
    )
@@ -343,11 +346,13 @@ def test_structured_output(
        temperature=0.8,
        top_p=0.95,
        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
+
    outputs = llm.generate(
-        prompts=("The best language for type-safe systems programming is "
-                 "(Make the response as short as possible.) "),
+        ("The best language for type-safe systems programming is "
+         "(Make the response as short as possible.) "),
        sampling_params=sampling_params,
-        use_tqdm=True)
+        use_tqdm=True,
+    )
    assert outputs is not None
    for output in outputs:
        assert output is not None
@@ -367,12 +372,14 @@ def test_structured_output(
        temperature=1.0,
        max_tokens=1000,
        guided_decoding=GuidedDecodingParams(json=json_schema))
-    outputs = llm.generate(prompts=(
-        "Generate a JSON with the brand, model and car_type of the most "
-        "iconic car from the 90's. Make the response as short as "
-        "possible."),
-                           sampling_params=sampling_params,
-                           use_tqdm=True)
+
+    outputs = llm.generate(
+        ("Generate a JSON with the brand, model and car_type of the most "
+         "iconic car from the 90's. Make the response as short as "
+         "possible."),
+        sampling_params=sampling_params,
+        use_tqdm=True,
+    )

    assert outputs is not None

@@ -411,10 +418,11 @@ def test_structured_output(
        guided_decoding=GuidedDecodingParams(json=json_schema))

    outputs = llm.generate(
-        prompts=("Generate a description of a frog using 50 characters. "
-                 "Make the response as short as possible."),
+        ("Generate a description of a frog using 50 characters. "
+         "Make the response as short as possible."),
        sampling_params=sampling_params,
-        use_tqdm=True)
+        use_tqdm=True,
+    )

    assert outputs is not None

@@ -498,7 +506,7 @@ Make the response as short as possible.
 """

        # Change this once other backends support structural_tag
-        outputs = llm.generate(prompts=prompt,
+        outputs = llm.generate(prompt,
                               sampling_params=sampling_params,
                               use_tqdm=True)
        assert outputs is not None
@@ -639,15 +647,13 @@ def test_structured_output_auto_mode(
        f"{unsupported_json_schema}. Make the response as short as possible.")
    # This would fail with the default of "xgrammar", but in "auto"
    # we will handle fallback automatically.
-    outputs = llm.generate(prompts=prompts,
+    outputs = llm.generate(prompts,
                           sampling_params=sampling_params,
                           use_tqdm=True)
    # Make sure `auto` backend handling doesn't mess up sampling_params
    # and that we can reuse it without error.
    outputs.extend(
-        llm.generate(prompts=prompts,
-                     sampling_params=sampling_params,
-                     use_tqdm=True))
+        llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True))

    assert outputs is not None
    for output in outputs:
@@ -705,7 +711,7 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
                                         max_tokens=256,
                                         guided_decoding=guided_params)

-        outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
+        outputs = llm.generate(prompt, sampling_params=sampling_params)
        assert outputs is not None
        generated_text = outputs[0].outputs[0].text
        assert generated_text is not None