tests/lora/test_llama_tp.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import sys
from typing import Union

import vllm
from vllm import LLM
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test

# Base model identifier handed to the vLLM engine by every test in this file.
MODEL_PATH = "meta-llama/Llama-2-7b-hf"

# Golden completions compared against do_sample() when it is called with
# lora_id=0 (no LoRA request attached). The comparison is a positional list
# equality, so there is one entry per prompt in do_sample(), in prompt order.
EXPECTED_NO_LORA_OUTPUT = [
    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
    "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
]
# Golden completions compared against do_sample() when it is called with a
# non-zero lora_id (the tests use ids 1 and 2 with the same adapter files).
# Entries are matched positionally against the prompts in do_sample().
EXPECTED_LORA_OUTPUT = [
    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
]


def do_sample(llm: vllm.LLM,
              lora_path: str,
              lora_id: int,
              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
    """Generate completions for six fixed text-to-SQL prompts.

    Sampling uses temperature 0, at most 256 new tokens, keeps special
    tokens in the text and stops at ``"[/assistant]"``.

    Args:
        llm: Engine used for generation.
        lora_path: Adapter location forwarded to ``LoRARequest``.
        lora_id: Adapter id; also used (as a string) for the request name.
            A falsy value (``0``) sends the request without any LoRA.
        tensorizer_config_dict: When not ``None`` it is forwarded to
            ``LoRARequest`` as the ``tensorizer_config_dict`` keyword.

    Returns:
        The text of the first completion of every returned output, in the
        order the outputs were returned.
    """
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]

    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
                                          skip_special_tokens=False,
                                          stop=["[/assistant]"])

    # A falsy lora_id (i.e. 0) means "generate without an adapter".
    lora_request = None
    if lora_id:
        # Only hand over the tensorizer settings when the caller gave some,
        # so the plain path keeps calling LoRARequest with three arguments.
        adapter_kwargs = {}
        if tensorizer_config_dict is not None:
            adapter_kwargs["tensorizer_config_dict"] = tensorizer_config_dict
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path,
                                   **adapter_kwargs)

    request_outputs = llm.generate(prompts,
                                   sampling_params,
                                   lora_request=lora_request)

    # Collect the first completion of each output and echo it for the log.
    completions: list[str] = []
    for request_output in request_outputs:
        prompt = request_output.prompt
        completion = request_output.outputs[0].text
        completions.append(completion)
        print(f"Prompt: {prompt!r}, Generated text: {completion!r}")
    return completions


def generate_and_test(llm,
                      sql_lora_files,
                      tensorizer_config_dict: Union[dict, None] = None):
    """Alternate base-model and LoRA sampling and check every result.

    Runs ``do_sample`` four times in sequence -- no LoRA, LoRA id 1,
    no LoRA again, LoRA id 2 -- and asserts that each run returns the
    matching expected-output list.

    Args:
        llm: Engine handed through to ``do_sample``.
        sql_lora_files: Adapter location handed through to ``do_sample``.
        tensorizer_config_dict: Optional tensorizer settings handed through
            to ``do_sample``.
    """
    # (log label, lora_id, expected generations) for each step, in run order.
    steps = (
        ("lora adapter created", 0, EXPECTED_NO_LORA_OUTPUT),
        ("lora 1", 1, EXPECTED_LORA_OUTPUT),
        ("no lora", 0, EXPECTED_NO_LORA_OUTPUT),
        ("lora 2", 2, EXPECTED_LORA_OUTPUT),
    )
    for label, adapter_id, expected in steps:
        print(label)
        generated = do_sample(llm,
                              sql_lora_files,
                              tensorizer_config_dict=tensorizer_config_dict,
                              lora_id=adapter_id)
        assert generated == expected

    print("removing lora")


@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):
    """Run the shared LoRA on/off checks on an engine without explicit TP."""
    engine_kwargs = dict(
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=13,
        max_loras=4,
        enable_chunked_prefill=True,
    )
    engine = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
    """Run the shared LoRA on/off checks with tensor_parallel_size=4."""
    engine_kwargs = dict(
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=4,
        enable_chunked_prefill=True,
    )
    engine = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
    """Run the shared LoRA checks with TP=4 and fully_sharded_loras=True."""
    engine_kwargs = dict(
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=4,
        fully_sharded_loras=True,
        enable_chunked_prefill=True,
    )
    engine = vllm.LLM(MODEL_PATH, **engine_kwargs)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
                                            sql_lora_huggingface_id):
    """Round-trip the model and a LoRA adapter through tensorizer with TP=2.

    The ``tensorize_vllm_model.py`` example script is first run in a
    subprocess to serialize the model and adapter under ``tmp_path``. The
    result is then loaded with ``load_format="tensorizer"`` and sampled once
    without a LoRA and once with LoRA id 1.

    Raises:
        subprocess.CalledProcessError: Re-raised, after dumping the captured
            stdout/stderr, when the serialization script exits non-zero.
    """
    # Run the tensorizing of the LoRA adapter and the model in a subprocess
    # to guarantee cleanup

    # Single source of truth for the TP degree: it is used both when
    # serializing and when loading so the two can never disagree.
    tp_size = 2
    # NOTE(review): presumably a per-rank shard filename template that the
    # tensorizer loader fills in -- confirm against TensorizerConfig.
    model_name = "model-rank-%03d.tensors"

    model_ref = MODEL_PATH
    lora_path = sql_lora_huggingface_id
    suffix = "test"
    serialize_cmd = [
        sys.executable,
        f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
        model_ref, "--lora-path", lora_path, "--tensor-parallel-size",
        str(tp_size), "serialize", "--serialized-directory",
        str(tmp_path), "--suffix", suffix
    ]
    try:
        result = subprocess.run(serialize_cmd,
                                check=True,
                                capture_output=True,
                                text=True)
    except subprocess.CalledProcessError as e:
        # Output is captured, so surface it before failing the test.
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
    # Look for the serialized adapter in the same directory as the model.
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir

    loaded_vllm_model = LLM(model=model_ref,
                            load_format="tensorizer",
                            enable_lora=True,
                            enforce_eager=True,
                            model_loader_extra_config=tensorizer_config,
                            max_num_seqs=13,
                            tensor_parallel_size=tp_size,
                            max_loras=2)

    tensorizer_config_dict = tensorizer_config.to_dict()

    print("lora adapter created")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=1) == EXPECTED_LORA_OUTPUT
-												[Misc] Add SPDX-License-Identifier headers to python source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
    This commit adds SPDX license headers to python source files as
    recommended to the project by the Linux Foundation. These headers
    provide a concise way that is both human and machine readable for
    communicating license information for each source file. It helps
    avoid any ambiguity about the license of the code and can also be
    easily used by tools to help manage license compliance.
    
    The Linux Foundation runs license scans against the codebase to help
    ensure we are in compliance with the licenses of the code we use,
    including dependencies. Having these headers in place helps that tool
    do its job.
    
    More information can be found on the SPDX site:
    
    - https://spdx.dev/learn/handling-license-info/
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
											
										
										
											2025-02-02 14:58:18 -05:00
+								# SPDX-License-Identifier: Apache-2.0
-												[Misc] Add SPDX-FileCopyrightText  (#19100)

Signed-off-by: simon-mo <simon.mo@hey.com>
											
										
										
											2025-06-03 11:20:17 -07:00
+								# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								import subprocess
 								import sys
 								from typing import Union
-												[Misc] Add SPDX-License-Identifier headers to python source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
    This commit adds SPDX license headers to python source files as
    recommended to the project by the Linux Foundation. These headers
    provide a concise way that is both human and machine readable for
    communicating license information for each source file. It helps
    avoid any ambiguity about the license of the code and can also be
    easily used by tools to help manage license compliance.
    
    The Linux Foundation runs license scans against the codebase to help
    ensure we are in compliance with the licenses of the code we use,
    including dependencies. Having these headers in place helps that tool
    do its job.
    
    More information can be found on the SPDX site:
    
    - https://spdx.dev/learn/handling-license-info/
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
											
										
										
											2025-02-02 14:58:18 -05:00
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								import vllm
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from vllm import LLM
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								from vllm.lora.request import LoRARequest
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 								EXPECTED_NO_LORA_OUTPUT = [
 								    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
 								    "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
 								    " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
 								    "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
 								]
 								EXPECTED_LORA_OUTPUT = [
 								    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
 								    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
 								    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
 								    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
 								    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
 								    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
 								]
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								def do_sample(llm: vllm.LLM,
 								              lora_path: str,
 								              lora_id: int,
 								              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    prompts = [
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
 								    ]
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    sampling_params = vllm.SamplingParams(temperature=0,
 								                                          max_tokens=256,
-												[V1][Perf] Faster incremental detokenization (#15137)

Signed-off-by: Nick Hill <nhill@redhat.com>
											
										
										
											2025-04-17 07:45:24 -07:00
+								                                          skip_special_tokens=False,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								                                          stop=["[/assistant]"])
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
 								    if tensorizer_config_dict is not None:
 								        outputs = llm.generate(
 								            prompts,
 								            sampling_params,
 								            lora_request=LoRARequest(
 								                str(lora_id),
 								                lora_id,
 								                lora_path,
 								                tensorizer_config_dict=tensorizer_config_dict)
 								            if lora_id else None)
 								    else:
 								        outputs = llm.generate(
 								            prompts,
 								            sampling_params,
 								            lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
 								            if lora_id else None)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    # Print the outputs.
-												Update deprecated Python 3.8 typing (#13971)


											
										
										
											2025-03-03 01:34:51 +00:00
+								    generated_texts: list[str] = []
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    for output in outputs:
 								        prompt = output.prompt
 								        generated_text = output.outputs[0].text
 								        generated_texts.append(generated_text)
 								        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 								    return generated_texts
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								def generate_and_test(llm,
 								                      sql_lora_files,
 								                      tensorizer_config_dict: Union[dict, None] = None):
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    print("lora adapter created")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert do_sample(llm,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("lora 1")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert do_sample(llm,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=1) == EXPECTED_LORA_OUTPUT
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("no lora")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert do_sample(llm,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("lora 2")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert do_sample(llm,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=2) == EXPECTED_LORA_OUTPUT
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("removing lora")
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):
    """Build a LoRA-enabled engine and run the shared generation checks.

    No ``tensor_parallel_size`` is passed, so the engine uses whatever
    default parallelism ``vllm.LLM`` applies.

    Args:
        sql_lora_files: Fixture value forwarded unchanged to
            ``generate_and_test``.
    """
    engine_options = {
        "enable_lora": True,
        # An odd max_num_seqs is exercised here on purpose.
        "max_num_seqs": 13,
        "max_loras": 4,
        "enable_chunked_prefill": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
    """Run the shared LoRA generation checks with tensor_parallel_size=4.

    Args:
        sql_lora_files: Fixture value forwarded unchanged to
            ``generate_and_test``.
    """
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "enable_chunked_prefill": True,
    }
    tp4_engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(tp4_engine, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								@multi_gpu_test(num_gpus=4)
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
 								    llm = vllm.LLM(
 								        MODEL_PATH,
 								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
 								        fully_sharded_loras=True,
-												[Misc] LoRA + Chunked Prefill (#9057)


											
										
										
											2024-12-10 21:09:20 -05:00
+								        enable_chunked_prefill=True,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
 								@multi_gpu_test(num_gpus=2)
 								@create_new_process_for_each_test()
 								def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
 								                                            sql_lora_huggingface_id):
 								    # Run the tensorizing of the LoRA adapter and the model in a subprocess
 								    # to guarantee cleanup
 								    tp_size = 2
 								    model_name = "model-rank-%03d.tensors"
 								    model_ref = MODEL_PATH
 								    lora_path = sql_lora_huggingface_id
 								    suffix = "test"
 								    try:
 								        result = subprocess.run([
 								            sys.executable,
-												[Doc] Move examples and further reorganize user guide (#18666)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
											
										
										
											2025-05-26 22:38:04 +08:00
+								            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
 								            str(tp_size), "serialize", "--serialized-directory",
 								            str(tmp_path), "--suffix", suffix
 								        ],
 								                                check=True,
 								                                capture_output=True,
 								                                text=True)
 								    except subprocess.CalledProcessError as e:
 								        print("Tensorizing failed.")
 								        print("STDOUT:\n", e.stdout)
 								        print("STDERR:\n", e.stderr)
 								        raise
 								    print("STDOUT:\n", result.stdout)
 								    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
 								    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
 								    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
 								    loaded_vllm_model = LLM(model=model_ref,
 								                            load_format="tensorizer",
 								                            enable_lora=True,
 								                            enforce_eager=True,
 								                            model_loader_extra_config=tensorizer_config,
 								                            max_num_seqs=13,
 								                            tensor_parallel_size=2,
 								                            max_loras=2)
 								    tensorizer_config_dict = tensorizer_config.to_dict()
 								    print("lora adapter created")
 								    assert do_sample(loaded_vllm_model,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 								    print("lora 1")
 								    assert do_sample(loaded_vllm_model,
 								                     sql_lora_files,
 								                     tensorizer_config_dict=tensorizer_config_dict,
 								                     lora_id=1) == EXPECTED_LORA_OUTPUT