# tests/lora/test_llama_tp.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess
import sys

import pytest

import vllm
import vllm.config
from vllm import LLM
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test

# Base model identifier handed to ``vllm.LLM`` and to the tensorize script's
# ``--model`` argument in the tests below.
MODEL_PATH = "meta-llama/Llama-2-7b-hf"

# Reference completions that ``do_sample`` must reproduce exactly (the tests
# compare with ``==``). There is one entry per prompt in ``do_sample``, listed in
# the same order as those prompts; leading/trailing spaces are significant.
EXPECTED_LORA_OUTPUT = [
    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ",  # noqa: E501
]


def do_sample(
    llm: vllm.LLM,
    lora_path: str,
    lora_id: int,
    tensorizer_config_dict: dict | None = None,
) -> list[str]:
    """Generate completions for the fixed text-to-SQL prompts and return them.

    Sampling uses ``temperature=0`` with at most 256 new tokens and stops at
    ``"[/assistant]"``.

    Args:
        llm: Engine used to run generation.
        lora_path: Adapter location handed to ``LoRARequest``.
        lora_id: Integer adapter id; its string form is used as the adapter
            name. A falsy id (``0``) generates without any LoRA request.
        tensorizer_config_dict: Optional tensorizer settings forwarded to
            ``LoRARequest``. The keyword is omitted entirely when ``None``.

    Returns:
        The text of the first completion of every request output, in the
        order ``llm.generate`` returned them.
    """
    sampling_params = vllm.SamplingParams(
        temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
    )

    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]",  # noqa: E501
    ]

    if not lora_id:
        # No adapter requested: run the engine without a LoRA request.
        lora_request = None
    else:
        # Pass the tensorizer keyword only when a config was actually supplied.
        extra_kwargs = (
            {}
            if tensorizer_config_dict is None
            else {"tensorizer_config_dict": tensorizer_config_dict}
        )
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path, **extra_kwargs)

    request_outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

    # Collect the first completion of each request and echo it for the test log.
    generated_texts: list[str] = []
    for request_output in request_outputs:
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return generated_texts


def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None):
    """Assert that the SQL LoRA reproduces the expected output under two ids.

    The same adapter files are sampled first as LoRA id 1 and then as LoRA
    id 2; each run must equal ``EXPECTED_LORA_OUTPUT`` exactly.

    Args:
        llm: Engine passed through to ``do_sample``.
        sql_lora_files: Adapter location passed to ``do_sample`` as the LoRA path.
        tensorizer_config_dict: Optional tensorizer settings forwarded unchanged.
    """
    print("lora adapter created")

    for banner, lora_id in (("lora 1", 1), ("lora 2", 2)):
        print(banner)
        generated = do_sample(
            llm,
            sql_lora_files,
            tensorizer_config_dict=tensorizer_config_dict,
            lora_id=lora_id,
        )
        assert generated == EXPECTED_LORA_OUTPUT

    # NOTE: this is only a log line; no adapter is removed in this function.
    print("removing lora")


@create_new_process_for_each_test()
@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool):
    """Check LoRA output with ``cudagraph_specialize_lora`` both on and off.

    No tensor-parallel size is given here, so the engine uses its default.
    """
    compilation_config = vllm.config.CompilationConfig(
        cudagraph_specialize_lora=cudagraph_specialize_lora,
    )
    engine = vllm.LLM(
        MODEL_PATH,
        tokenizer=sql_lora_files,
        enable_lora=True,
        max_num_seqs=13,  # also test odd max_num_seqs
        max_loras=4,
        compilation_config=compilation_config,
    )
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
def test_llama_lora_tp4(sql_lora_files):
    """Check LoRA output with ``tensor_parallel_size=4``."""
    engine_options = {
        "tokenizer": sql_lora_files,
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=4)
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
    """Check LoRA output with ``tensor_parallel_size=4`` and fully sharded LoRAs."""
    engine_options = {
        "tokenizer": sql_lora_files,
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
    }
    engine = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(engine, sql_lora_files)


@multi_gpu_test(num_gpus=2)
def test_tp2_serialize_and_deserialize_lora(
    tmp_path, sql_lora_files, sql_lora_huggingface_id
):
    """Serialize model + LoRA with tensorizer, reload them, and check LoRA output.

    Steps:
      1. Run ``examples/others/tensorize_vllm_model.py`` with the ``serialize``
         subcommand in a subprocess, writing into ``tmp_path``.
      2. Build an ``LLM`` with ``load_format="tensorizer"`` pointing at the
         serialized files.
      3. Sample with LoRA id 1 and compare against ``EXPECTED_LORA_OUTPUT``.
    """
    tp_size = 2
    model_ref = MODEL_PATH
    lora_path = sql_lora_huggingface_id
    suffix = "test"
    # presumably ``%03d`` is filled in per tensor-parallel rank by the
    # loader — TODO confirm against the tensorizer code.
    model_name = "model-rank-%03d.tensors"

    # Tensorize the LoRA adapter and the model in a subprocess to guarantee
    # cleanup once serialization is done.
    serialize_cmd = [
        sys.executable,
        f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
        "--model",
        model_ref,
        "--lora-path",
        lora_path,
        "--tensor-parallel-size",
        str(tp_size),
        "serialize",
        "--serialized-directory",
        str(tmp_path),
        "--suffix",
        suffix,
        "--serialization-kwargs",
        '{"limit_cpu_concurrency": 4}',
    ]
    try:
        result = subprocess.run(
            serialize_cmd,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as err:
        # Surface the child's captured output before failing the test.
        print("Tensorizing failed.")
        print("STDOUT:\n", err.stdout)
        print("STDERR:\n", err.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    # NOTE(review): this layout is assumed to match where the serialize step
    # writes its output — confirm against the script.
    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))

    loaded_llm = LLM(
        model=model_ref,
        tokenizer=sql_lora_files,
        load_format="tensorizer",
        model_loader_extra_config=tensorizer_config,
        enable_lora=True,
        enforce_eager=True,
        max_num_seqs=13,
        max_loras=2,
        tensor_parallel_size=tp_size,
    )

    tc_as_dict = tensorizer_config.to_serializable()

    print("lora adapter created")
    print("lora 1")
    generated = do_sample(
        loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
    )
    assert generated == EXPECTED_LORA_OUTPUT
-												[Misc] Add SPDX-License-Identifier headers to python source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
This commit adds SPDX license headers to python source files as
recommended to
the project by the Linux Foundation. These headers provide a concise way
that is
both human and machine readable for communicating license information
for each
source file. It helps avoid any ambiguity about the license of the code
and can
    also be easily used by tools to help manage license compliance.
    
The Linux Foundation runs license scans against the codebase to help
ensure
    we are in compliance with the licenses of the code we use, including
dependencies. Having these headers in place helps that tool do its job.
    
    More information can be found on the SPDX site:
    
    - https://spdx.dev/learn/handling-license-info/
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
											
										
										
											2025-02-02 14:58:18 -05:00
+								# SPDX-License-Identifier: Apache-2.0
-												[Misc] Add SPDX-FileCopyrightText  (#19100)

Signed-off-by: simon-mo <simon.mo@hey.com>
											
										
										
											2025-06-03 11:20:17 -07:00
+								# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								import subprocess
 								import sys
-												[Misc] Add SPDX-License-Identifier headers to python source files (#12628)

- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
This commit adds SPDX license headers to python source files as
recommended to
the project by the Linux Foundation. These headers provide a concise way
that is
both human and machine readable for communicating license information
for each
source file. It helps avoid any ambiguity about the license of the code
and can
    also be easily used by tools to help manage license compliance.
    
The Linux Foundation runs license scans against the codebase to help
ensure
    we are in compliance with the licenses of the code we use, including
dependencies. Having these headers in place helps that tool do its job.
    
    More information can be found on the SPDX site:
    
    - https://spdx.dev/learn/handling-license-info/
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
											
										
										
											2025-02-02 14:58:18 -05:00
-												[LoRA] LoRA cuda graph specialization (#25914)

Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-10-20 05:21:09 +01:00
+								import pytest
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								import vllm
-												[LoRA] LoRA cuda graph specialization (#25914)

Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-10-20 05:21:09 +01:00
+								import vllm.config
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from vllm import LLM
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								from vllm.lora.request import LoRARequest
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 								EXPECTED_LORA_OUTPUT = [
 								    "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-												Fix per file ruff ignores related to line length (#26262)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
											
										
										
											2025-10-06 06:12:40 +01:00
+								    "  SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
 								    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
-												Fix per file ruff ignores related to line length (#26262)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
											
										
										
											2025-10-06 06:12:40 +01:00
+								    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ",  # noqa: E501
 								]
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								def do_sample(
 								    llm: vllm.LLM,
 								    lora_path: str,
 								    lora_id: int,
 								    tensorizer_config_dict: dict | None = None,
 								) -> list[str]:
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    prompts = [
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
 								        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]",  # noqa: E501
 								    ]
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    sampling_params = vllm.SamplingParams(
-												[V1][Perf] Faster incremental detokenization (#15137)

Signed-off-by: Nick Hill <nhill@redhat.com>
											
										
										
											2025-04-17 07:45:24 -07:00
+								        temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    )
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
 								    if tensorizer_config_dict is not None:
 								        outputs = llm.generate(
 								            prompts,
 								            sampling_params,
 								            lora_request=LoRARequest(
 								                str(lora_id),
 								                lora_id,
 								                lora_path,
 								                tensorizer_config_dict=tensorizer_config_dict,
 								            )
 								            if lora_id
 								            else None,
 								        )
 								    else:
 								        outputs = llm.generate(
 								            prompts,
 								            sampling_params,
 								            lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
 								            if lora_id
 								            else None,
 								        )
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    # Print the outputs.
-												Update deprecated Python 3.8 typing (#13971)


											
										
										
											2025-03-03 01:34:51 +00:00
+								    generated_texts: list[str] = []
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    for output in outputs:
 								        prompt = output.prompt
 								        generated_text = output.outputs[0].text
 								        generated_texts.append(generated_text)
 								        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 								    return generated_texts
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None):
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								    print("lora adapter created")
 								    print("lora 1")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert (
 								        do_sample(
 								            llm,
 								            sql_lora_files,
 								            tensorizer_config_dict=tensorizer_config_dict,
 								            lora_id=1,
 								        )
 								        == EXPECTED_LORA_OUTPUT
-												Convert formatting to use `ruff` instead of `yapf` + `isort` (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
											
										
										
											2025-10-05 15:06:22 +01:00
+								    )
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("lora 2")
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								    assert (
 								        do_sample(
 								            llm,
 								            sql_lora_files,
 								            tensorizer_config_dict=tensorizer_config_dict,
 								            lora_id=2,
 								        )
 								        == EXPECTED_LORA_OUTPUT
-												Convert formatting to use `ruff` instead of `yapf` + `isort` (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
											
										
										
											2025-10-05 15:06:22 +01:00
+								    )
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								    print("removing lora")
-												[Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2025-03-17 19:33:35 +08:00
+								@create_new_process_for_each_test()
-												[LoRA] LoRA cuda graph specialization (#25914)

Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-10-20 05:21:09 +01:00
+								@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
 								def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool):
-												[Bugfix] LoRA V0 - Fix case where `max_num_seqs` is between cudagraph capture sizes (#15308)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
											
										
										
											2025-03-22 05:03:32 -04:00
+								    llm = vllm.LLM(
 								        MODEL_PATH,
-												[Core] Remove tokenizer group in vLLM (#24078)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
											
										
										
											2025-09-17 01:42:59 -07:00
+								        tokenizer=sql_lora_files,
-												[Bugfix] LoRA V0 - Fix case where `max_num_seqs` is between cudagraph capture sizes (#15308)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
											
										
										
											2025-03-22 05:03:32 -04:00
+								        enable_lora=True,
 								        # also test odd max_num_seqs
 								        max_num_seqs=13,
-												[V0 Deprecation] Remove V0 LoRA test (#23418)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-08-22 17:56:51 +08:00
+								        max_loras=4,
-												[LoRA] LoRA cuda graph specialization (#25914)

Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-10-20 05:21:09 +01:00
+								        compilation_config=vllm.config.CompilationConfig(
 								            cudagraph_specialize_lora=cudagraph_specialize_lora,
 								        ),
-												[V0 Deprecation] Remove V0 LoRA test (#23418)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2025-08-22 17:56:51 +08:00
+								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								@multi_gpu_test(num_gpus=4)
 								def test_llama_lora_tp4(sql_lora_files):
 								    llm = vllm.LLM(
 								        MODEL_PATH,
-												[Core] Remove tokenizer group in vLLM (#24078)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
											
										
										
											2025-09-17 01:42:59 -07:00
+								        tokenizer=sql_lora_files,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
 								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
 								@multi_gpu_test(num_gpus=4)
 								def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
 								    llm = vllm.LLM(
 								        MODEL_PATH,
-												[Core] Remove tokenizer group in vLLM (#24078)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
											
										
										
											2025-09-17 01:42:59 -07:00
+								        tokenizer=sql_lora_files,
-												[Bugfix] Fix LoRA weight sharding (#10450)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
											
										
										
											2024-11-24 09:23:17 +08:00
+								        enable_lora=True,
 								        max_num_seqs=16,
 								        max_loras=4,
 								        tensor_parallel_size=4,
 								        fully_sharded_loras=True,
 								    )
-												[Misc][LoRA] Move the implementation of lora bias to punica.py (#10829)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
											
										
										
											2024-12-03 01:53:36 +08:00
+								    generate_and_test(llm, sql_lora_files)
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
 								@multi_gpu_test(num_gpus=2)
 								def test_tp2_serialize_and_deserialize_lora(
 								    tmp_path, sql_lora_files, sql_lora_huggingface_id
 								):
 								    # Run the tensorizing of the LoRA adapter and the model in a subprocess
 								    # to guarantee cleanup
 								    tp_size = 2
 								    model_name = "model-rank-%03d.tensors"
 								    model_ref = MODEL_PATH
 								    lora_path = sql_lora_huggingface_id
 								    suffix = "test"
 								    try:
 								        result = subprocess.run(
 								            [
 								                sys.executable,
-												[Doc] Move examples and further reorganize user guide (#18666)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
											
										
										
											2025-05-26 22:38:04 +08:00
+								                f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
 								                "--model",
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								                MODEL_PATH,
 								                "--lora-path",
 								                lora_path,
 								                "--tensor-parallel-size",
 								                str(tp_size),
 								                "serialize",
 								                "--serialized-directory",
-												[Frontend] [Core] Integrate Tensorizer in to S3 loading machinery, allow passing arbitrary arguments during save/load (#19619)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Eta <esyra@coreweave.com>
											
										
										
											2025-07-08 01:47:43 -04:00
+								                str(tmp_path),
 								                "--suffix",
 								                suffix,
 								                "--serialization-kwargs",
 								                '{"limit_cpu_concurrency": 4}',
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								            ],
 								            check=True,
 								            capture_output=True,
 								            text=True,
 								        )
 								    except subprocess.CalledProcessError as e:
 								        print("Tensorizing failed.")
 								        print("STDOUT:\n", e.stdout)
 								        print("STDERR:\n", e.stderr)
 								        raise
 								    print("STDOUT:\n", result.stdout)
 								    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
 								    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
-												[Misc] unify variable for LLM instance (#20996)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
											
										
										
											2025-07-21 19:18:33 +08:00
+								    loaded_llm = LLM(
 								        model=model_ref,
-												[Core] Remove tokenizer group in vLLM (#24078)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
											
										
										
											2025-09-17 01:42:59 -07:00
+								        tokenizer=sql_lora_files,
-												[Misc] unify variable for LLM instance (#20996)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
											
										
										
											2025-07-21 19:18:33 +08:00
+								        load_format="tensorizer",
 								        enable_lora=True,
 								        enforce_eager=True,
 								        model_loader_extra_config=tensorizer_config,
 								        max_num_seqs=13,
 								        tensor_parallel_size=2,
 								        max_loras=2,
 								    )
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
-												[Bugfix] [CI] Fix Tensorizer LoRA test (#20760)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-07-10 15:07:06 -04:00
+								    tc_as_dict = tensorizer_config.to_serializable()
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
 								    print("lora adapter created")
 								    print("lora 1")
-												[Misc] unify variable for LLM instance (#20996)

Signed-off-by: Andy Xie <andy.xning@gmail.com>
											
										
										
											2025-07-21 19:18:33 +08:00
+								    assert (
 								        do_sample(
-												[Bugfix] [CI] Fix Tensorizer LoRA test (#20760)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-07-10 15:07:06 -04:00
+								            loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
-												[Frontend] [Core] Add Tensorizer support for V1, LoRA adapter serialization and deserialization (#17926)

Signed-off-by: Sanger Steel <sangersteel@gmail.com>
											
										
										
											2025-05-22 21:44:18 -04:00
+								        )
 								        == EXPECTED_LORA_OUTPUT
-												Convert formatting to use `ruff` instead of `yapf` + `isort` (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
											
										
										
											2025-10-05 15:06:22 +01:00
+								    )