Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -10,12 +10,16 @@ import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.interfaces import SupportsLoRA
@@ -47,11 +51,13 @@ def dist_init():
if current_platform.is_cpu() or current_platform.is_tpu():
backend = "gloo"
init_distributed_environment(world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend=backend)
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend=backend,
)
initialize_model_parallel(1, 1)
yield
cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -66,10 +72,9 @@ def dist_init_torch_only():
backend = "gloo"
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(world_size=1,
rank=0,
init_method=f"file://{temp_file}",
backend=backend)
torch.distributed.init_process_group(
world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
)
class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@@ -79,24 +84,30 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@pytest.fixture
def dummy_model() -> nn.Module:
model = DummyLoRAModel(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", ColumnParallelLinear(50, 10)),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("logits_processor", LogitsProcessor(512)),
]))
OrderedDict(
[
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict(
[
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
]
)
),
),
("act2", nn.ReLU()),
("output", ColumnParallelLinear(50, 10)),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("logits_processor", LogitsProcessor(512)),
]
)
)
model.config = MagicMock()
model.embedding_modules = {"lm_head": "lm_head"}
model.unpadded_vocab_size = 32000
@@ -106,24 +117,30 @@ def dummy_model() -> nn.Module:
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
model = DummyLoRAModel(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("logits_processor", LogitsProcessor(512)),
]))
OrderedDict(
[
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict(
[
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
]
)
),
),
("act2", nn.ReLU()),
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("logits_processor", LogitsProcessor(512)),
]
)
)
model.config = MagicMock()
model.packed_modules_mapping = {
"gate_up_proj": [

View File

@@ -7,7 +7,8 @@ import pytest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
@@ -26,14 +27,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]:
return lora_requests
async def requests_processing_time(llm,
lora_requests: list[LoRARequest]) -> float:
sampling_params = SamplingParams(n=1,
temperature=0.0,
top_p=1.0,
ignore_eos=True,
max_tokens=1)
async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
sampling_params = SamplingParams(
n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
)
generators = []
start = time.perf_counter()
@@ -41,11 +38,11 @@ async def requests_processing_time(llm,
for lora_request in lora_requests:
lora_int_id = lora_request.lora_int_id
generator = llm.generate(
prompt=TextPrompt(prompt=f"hello {lora_int_id}",
multi_modal_data=None), # type: ignore
prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None), # type: ignore
sampling_params=sampling_params,
lora_request=lora_request,
request_id=f"test{lora_int_id}")
request_id=f"test{lora_int_id}",
)
generators.append(generator)
all_gens = merge_async_iterators(*generators)
@@ -58,13 +55,13 @@ async def requests_processing_time(llm,
@pytest.mark.asyncio
async def test_add_lora(chatglm3_lora_files):
"""
"""
The add_lora function is used to preload some LoRA adapters into the
engine in anticipation of future requests using these adapters. To test
this functionality, we use the async engine to process some requests - We
do it twice, once with add_lora() preloading and once without.
We measure the request processing time in both cases and expect the time
We measure the request processing time in both cases and expect the time
to be lesser in the case with add_lora() calls.
"""
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
@@ -78,18 +75,18 @@ async def test_add_lora(chatglm3_lora_files):
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8, #avoid OOM
gpu_memory_utilization=0.8, # avoid OOM
trust_remote_code=True,
enforce_eager=True)
enforce_eager=True,
)
# split lora_requests into 3 parts
part_size = len(lora_requests) // 3
dummy_run_requests = lora_requests[:part_size]
warmup_run_requests = lora_requests[part_size:part_size * 2]
cold_run_requests = lora_requests[part_size * 2:]
warmup_run_requests = lora_requests[part_size : part_size * 2]
cold_run_requests = lora_requests[part_size * 2 :]
async with build_async_engine_client_from_engine_args(engine_args) as llm:
# Dummy run - So any 1-time functionality like triton kernel compilation
# is complete here.
await requests_processing_time(llm, dummy_run_requests)
@@ -101,18 +98,16 @@ async def test_add_lora(chatglm3_lora_files):
# Test that all add_lora calls are successful.
assert all(add_lora_results)
time_with_add_lora = await requests_processing_time(
llm, warmup_run_requests)
time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)
# Run without any warmup
time_cold_start = await requests_processing_time(
llm, cold_run_requests)
time_cold_start = await requests_processing_time(llm, cold_run_requests)
print(f"time hot-start {time_with_add_lora} vs "
f"time cold-start {time_cold_start} ")
print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")
assert time_with_add_lora < time_cold_start, (
f"time_with_add_lora={time_with_add_lora}, "
f"time_cold_start={time_cold_start}"
"The engine request processing time with LoRA pre-loading "
"must be less than the version that does on-demand LoRA loading.")
"must be less than the version that does on-demand LoRA loading."
)

View File

@@ -21,20 +21,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -47,13 +45,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
trust_remote_code=True,
enable_chunked_prefill=True)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
trust_remote_code=True,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -66,15 +66,17 @@ def test_chatglm3_lora(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=False,
enable_chunked_prefill=True)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=False,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -90,16 +92,18 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
# more GPU memory causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True,
gpu_memory_utilization=0.85)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True,
gpu_memory_utilization=0.85,
)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -32,15 +32,12 @@ VLLM_RUNNER_BASE_KWARGS = {
"max_lora_rank": 320,
"max_model_len": 12800,
"gpu_memory_utilization": 0.8,
"limit_mm_per_prompt": {
"audio": 1
},
"limit_mm_per_prompt": {"audio": 1},
"enforce_eager": True,
}
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
**kwargs):
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]
# Apply any additional kwargs as overrides to the base kwargs
@@ -53,11 +50,11 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
max_tokens=128,
audios=audios,
lora_request=lora_request,
) for prompts, audios in inputs
)
for prompts, audios in inputs
]
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
expected_suffix)
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)
def test_active_default_mm_lora(

File diff suppressed because it is too large Load Diff

View File

@@ -19,27 +19,28 @@ EXPECTED_LORA_OUTPUT = [
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
]
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
def do_sample(
llm: vllm.LLM,
lora_path: str,
lora_id: int,
tensorizer_config_dict: Union[dict, None] = None,
) -> list[str]:
prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501
]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=256,
skip_special_tokens=False,
stop=["[/assistant]"])
sampling_params = vllm.SamplingParams(
temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
)
if tensorizer_config_dict is not None:
outputs = llm.generate(
@@ -49,14 +50,19 @@ def do_sample(llm: vllm.LLM,
str(lora_id),
lora_id,
lora_path,
tensorizer_config_dict=tensorizer_config_dict)
if lora_id else None)
tensorizer_config_dict=tensorizer_config_dict,
)
if lora_id
else None,
)
else:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
if lora_id
else None,
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -67,42 +73,51 @@ def do_sample(llm: vllm.LLM,
return generated_texts
def generate_and_test(llm,
sql_lora_files,
tensorizer_config_dict: Union[dict, None] = None):
def generate_and_test(
llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None
):
print("lora adapter created")
print("lora 1")
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
assert (
do_sample(
llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1,
)
== EXPECTED_LORA_OUTPUT
)
print("lora 2")
assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2) == EXPECTED_LORA_OUTPUT
assert (
do_sample(
llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2,
)
== EXPECTED_LORA_OUTPUT
)
print("removing lora")
@create_new_process_for_each_test()
def test_llama_lora(sql_lora_files):
llm = vllm.LLM(
MODEL_PATH,
tokenizer=sql_lora_files,
enable_lora=True,
# also test odd max_num_seqs
max_num_seqs=13,
max_loras=4)
max_loras=4,
)
generate_and_test(llm, sql_lora_files)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files):
llm = vllm.LLM(
MODEL_PATH,
tokenizer=sql_lora_files,
@@ -117,7 +132,6 @@ def test_llama_lora_tp4(sql_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
llm = vllm.LLM(
MODEL_PATH,
tokenizer=sql_lora_files,
@@ -132,9 +146,9 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
sql_lora_huggingface_id):
def test_tp2_serialize_and_deserialize_lora(
tmp_path, sql_lora_files, sql_lora_huggingface_id
):
# Run the tensorizing of the LoRA adapter and the model in a subprocess
# to guarantee cleanup
@@ -145,17 +159,28 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
lora_path = sql_lora_huggingface_id
suffix = "test"
try:
result = subprocess.run([
sys.executable,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
'{"limit_cpu_concurrency": 4}'
],
check=True,
capture_output=True,
text=True)
result = subprocess.run(
[
sys.executable,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
"--model",
MODEL_PATH,
"--lora-path",
lora_path,
"--tensor-parallel-size",
str(tp_size),
"serialize",
"--serialized-directory",
str(tmp_path),
"--suffix",
suffix,
"--serialization-kwargs",
'{"limit_cpu_concurrency": 4}',
],
check=True,
capture_output=True,
text=True,
)
except subprocess.CalledProcessError as e:
print("Tensorizing failed.")
print("STDOUT:\n", e.stdout)
@@ -167,21 +192,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
loaded_llm = LLM(model=model_ref,
tokenizer=sql_lora_files,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
loaded_llm = LLM(
model=model_ref,
tokenizer=sql_lora_files,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2,
)
tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created")
print("lora 1")
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
assert (
do_sample(
loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
)
== EXPECTED_LORA_OUTPUT
)

View File

@@ -5,6 +5,7 @@ This script contains:
1. test multi loras service with tp >= 2
2. test multi loras request
"""
import pytest
from tests.utils import multi_gpu_test
@@ -31,14 +32,8 @@ LORA_TEST_EXPECTED = [
def format_chatml_messages(prompt: str):
return [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": prompt
},
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
]
@@ -57,7 +52,6 @@ def make_add_lora_request(name: str, path: str):
@multi_gpu_test(num_gpus=2)
def test_multi_loras_with_tp_sync():
llm = LLM(
model=MODEL_PATH,
enable_lora=True,
@@ -116,15 +110,17 @@ def test_multi_loras_with_tp_sync():
def reload_lora(name: str):
"""
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
reload a lora to simulate the case:
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
for dynamic lora loading and unloading
"""
remove_lora_response = llm.llm_engine.remove_lora(
lora_id=LORA_NAME_ID_MAP[name])
lora_id=LORA_NAME_ID_MAP[name]
)
add_lora_response = llm.llm_engine.add_lora(
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
)
print(f"{remove_lora_response=}, {add_lora_response=}")
@@ -134,7 +130,6 @@ def test_multi_loras_with_tp_sync():
assert outputs == expected
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
@@ -175,8 +170,7 @@ def test_multiple_lora_requests():
PROMPTS = ["Hello, my name is"] * 2
LORA_NAME = "Alice"
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1,
LORA_NAME_PATH_MAP[LORA_NAME])
LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt

View File

@@ -8,9 +8,7 @@ from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper
lora_lst = [
"baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
]
lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
BAICHUAN_LORA_MODULES = [
"W_pack",
"o_proj",
@@ -37,8 +35,9 @@ def test_load_checkpoints(
else:
expected_lora_modules.append(module)
if lora_name == "baichuan7B":
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
baichuan_lora_files, max_position_embeddings=4096
)
# For the baichuan7B model, load its LoRA,
# and the test should pass.
LoRAModel.from_local_checkpoint(
@@ -48,13 +47,15 @@ def test_load_checkpoints(
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
embedding_padding_modules=embed_padding_modules,
)
elif lora_name == "baichuan7B-zero":
# Test that the target_modules contain prefix
# such as "model.layers.0.self_atten.W_pack", and
# the test should pass.
peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
baichuan_zero_lora_files, max_position_embeddings=4096
)
LoRAModel.from_local_checkpoint(
baichuan_zero_lora_files,
expected_lora_modules,
@@ -62,12 +63,14 @@ def test_load_checkpoints(
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
embedding_padding_modules=embed_padding_modules,
)
elif lora_name == "baichuan7B-zero-regex":
# Test that `target_modules` can be given as regular expressions,
# such as `model\\..*(W_pack|o_proj)`, and the test should pass.
peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
baichuan_regex_lora_files, max_position_embeddings=4096
)
LoRAModel.from_local_checkpoint(
baichuan_regex_lora_files,
expected_lora_modules,
@@ -75,13 +78,15 @@ def test_load_checkpoints(
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
embedding_padding_modules=embed_padding_modules,
)
else:
# For the baichuan7B model, load chatglm3-6b's LoRA,
# and the test should raise the following error.
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
chatglm3_lora_files, max_position_embeddings=4096
)
with pytest.raises(ValueError, match=expected_error):
LoRAModel.from_local_checkpoint(
chatglm3_lora_files,
@@ -90,11 +95,11 @@ def test_load_checkpoints(
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
embedding_padding_modules=embed_padding_modules,
)
def test_lora_weights_mapping(baichuan_lora_files):
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
@@ -113,8 +118,9 @@ def test_lora_weights_mapping(baichuan_lora_files):
".layers.": ".baichuan_layers.",
},
)
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
baichuan_lora_files, max_position_embeddings=4096
)
lora_model = LoRAModel.from_local_checkpoint(
baichuan_lora_files,
expected_lora_modules,

View File

@@ -3,11 +3,13 @@
"""
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
"""
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
build_async_engine_client_from_engine_args,
)
from vllm.lora.request import LoRARequest
from vllm.v1.engine.llm_engine import LLMEngine
@@ -17,23 +19,24 @@ LORA_RANK = 8
def make_lora_request(lora_id: int):
return LoRARequest(lora_name=f"{lora_id}",
lora_int_id=lora_id,
lora_path=LORA_MODULE_PATH)
return LoRARequest(
lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
)
def test_lora_functions_sync():
max_loras = 4
# Create engine in eager-mode. Due to high max_loras, the CI can
# OOM during cuda-graph capture.
engine_args = EngineArgs(model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
enforce_eager=True)
engine_args = EngineArgs(
model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
enforce_eager=True,
)
llm = LLMEngine.from_engine_args(engine_args)
@@ -70,15 +73,16 @@ def test_lora_functions_sync():
@pytest.mark.asyncio
async def test_lora_functions_async():
max_loras = 4
engine_args = AsyncEngineArgs(model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
enforce_eager=True)
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8,
enforce_eager=True,
)
async def run_check(fn, args, expected: list):
await fn(args)

View File

@@ -11,8 +11,12 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
# Provide absolute path and huggingface lora ids
lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
LLAMA_LORA_MODULES = [
"qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
"lm_head"
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
@@ -40,7 +44,8 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_model_id=1,
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
embedding_padding_modules=embed_padding_modules,
)
# Assertions to ensure the model is loaded correctly
assert lora_model is not None, "LoRAModel is not loaded correctly"

View File

@@ -10,16 +10,21 @@ from torch import nn
from vllm.config import ModelConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA)
from vllm.lora.layers import (
ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA,
)
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager)
from vllm.lora.models import (
LoRAMapping,
LoRAModel,
LoRAModelManager,
LRUCacheLoRAModelManager,
)
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager, WorkerLoRAManager
from vllm.platforms import current_platform
from .utils import create_peft_lora
@@ -31,22 +36,25 @@ EMBEDDING_MODULES = {
EMBEDDING_PADDING_MODULES = ["lm_head"]
DEVICES = ([
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] if current_platform.is_cuda_alike() else ["cpu"])
DEVICES = (
[f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
if current_platform.is_cuda_alike()
else ["cpu"]
)
DEFAULT_DTYPE = torch.get_default_dtype()
@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file(
os.path.join(sql_lora_files, "adapter_model.safetensors"))
tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
new_embeddings = load_file(
os.path.join(sql_lora_files, "new_embeddings.safetensors"))
os.path.join(sql_lora_files, "new_embeddings.safetensors")
)
peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
sql_lora_files, max_position_embeddings=4096
)
lora_model = LoRAModel.from_lora_tensors(
1,
tensors,
@@ -54,7 +62,8 @@ def test_from_lora_tensors(sql_lora_files, device):
device=device,
embeddings=new_embeddings,
embedding_modules=EMBEDDING_MODULES,
embedding_padding_modules=EMBEDDING_PADDING_MODULES)
embedding_padding_modules=EMBEDDING_PADDING_MODULES,
)
for module_name, lora in lora_model.loras.items():
assert lora.module_name == module_name
assert lora.rank == 8
@@ -63,22 +72,27 @@ def test_from_lora_tensors(sql_lora_files, device):
assert lora.lora_b is not None
assert lora.lora_a.device == torch.device(device)
assert lora.lora_b.device == torch.device(device)
assert (lora.lora_a.shape[0] == lora.lora_b.shape[1]
), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
assert lora.lora_a.shape[0] == lora.lora_b.shape[1], (
f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
)
assert lora.lora_a.shape[0] == 8
embeddings_module = next(
(k for k in EMBEDDING_MODULES if k in module_name), None)
(k for k in EMBEDDING_MODULES if k in module_name), None
)
if embeddings_module:
assert torch.equal(
lora.embeddings_tensor,
new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
device=lora.embeddings_tensor.device))
device=lora.embeddings_tensor.device
),
)
else:
assert lora.embeddings_tensor is None
def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
device: torch.device) -> LoRAModel:
def create_lora(
lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device
) -> LoRAModel:
loras: dict[str, LoRALayerWeights] = {}
for name in sub_modules:
w = model.get_submodule(name).weight
@@ -110,8 +124,7 @@ def create_packed_lora(
8,
16,
torch.rand([8, w.shape[1]], device=device),
torch.rand([w.shape[0] // len(replaced_module_names), 8],
device=device),
torch.rand([w.shape[0] // len(replaced_module_names), 8], device=device),
)
return LoRAModel(lora_id, 8, loras)
@@ -119,42 +132,42 @@ def create_packed_lora(
def test_replace_submodules(dist_init, dummy_model):
model = dummy_model
manager = LoRAModelManager(
model, 1, 1, 1,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=8,
max_loras=8,
lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0]))
model,
1,
1,
1,
LoRAConfig(
max_lora_rank=8, max_cpu_loras=8, max_loras=8, lora_dtype=DEFAULT_DTYPE
),
torch.device(DEVICES[0]),
)
model = manager.model
assert isinstance(model.get_submodule("dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA)
assert isinstance(
model.get_submodule("layer1.dense1"), ColumnParallelLinearWithLoRA
)
assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense2"),
RowParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense2"), RowParallelLinearWithLoRA)
@pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device):
model = dummy_model
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
model_lora2 = create_lora(2,
model, ["dense1", "dense2", "lm_head"],
device=device)
model_lora3 = create_lora(3,
model, ["dense1", "dense2", "lm_head"],
device=device)
manager = LoRAModelManager(model,
2,
2,
2,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=3,
max_loras=2,
lora_dtype=DEFAULT_DTYPE),
device=device)
model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
)
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
manager = LoRAModelManager(
model,
2,
2,
2,
LoRAConfig(
max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
),
device=device,
)
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_adapter(model_lora1)
assert manager.activate_adapter(1)
@@ -204,24 +217,21 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
model = dummy_model
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
model_lora2 = create_lora(2,
model, ["dense1", "dense2", "lm_head"],
device=device)
model_lora3 = create_lora(3,
model, ["dense1", "dense2", "lm_head"],
device=device)
manager = LRUCacheLoRAModelManager(model,
2,
2,
2,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=3,
max_loras=2,
lora_dtype=DEFAULT_DTYPE),
device=device)
model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
)
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
manager = LRUCacheLoRAModelManager(
model,
2,
2,
2,
LoRAConfig(
max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
),
device=device,
)
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_adapter(model_lora1)
assert manager.activate_adapter(1)
@@ -297,27 +307,22 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
model = dummy_model
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
model_lora2 = create_lora(2,
model, ["dense1", "dense2", "lm_head"],
device=device)
model_lora3 = create_lora(3,
model, ["dense1", "dense2", "lm_head"],
device=device)
model_lora4 = create_lora(4,
model, ["dense1", "dense2", "lm_head"],
device=device)
manager = LRUCacheLoRAModelManager(model,
2,
2,
2,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=2,
max_loras=2,
lora_dtype=DEFAULT_DTYPE),
device=device)
model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
)
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"], device=device)
manager = LRUCacheLoRAModelManager(
model,
2,
2,
2,
LoRAConfig(
max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
),
device=device,
)
assert all(x is None for x in manager.lora_index_to_id)
# Add up to capacity
@@ -421,12 +426,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
tmp_path):
lora_config = LoRAConfig(max_lora_rank=8,
max_cpu_loras=4,
max_loras=4,
lora_dtype=DEFAULT_DTYPE)
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
)
dummy_lora_files = f"{tmp_path}/lora_adapter"
os.makedirs(dummy_lora_files, exist_ok=True)
@@ -438,13 +441,13 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
)
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(model_config=model_config,
lora_config=lora_config)
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2
worker_adapter_manager = LRUCacheWorkerLoRAManager(
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
)
worker_adapter_manager.max_num_seqs = 4
worker_adapter_manager.max_num_batched_tokens = 2
@@ -452,52 +455,64 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
worker_adapter_manager.create_lora_manager(dummy_model)
mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7
@@ -506,41 +521,40 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
# Over capacity
with pytest.raises(RuntimeError):
worker_adapter_manager.set_active_adapters([
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.device == device
assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
device)
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
tmp_path):
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8,
max_cpu_loras=4,
max_loras=4,
lora_dtype=DEFAULT_DTYPE)
lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
)
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(model_config=model_config,
lora_config=lora_config)
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
EMBEDDING_MODULES,
EMBEDDING_PADDING_MODULES)
worker_adapter_manager = WorkerLoRAManager(
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
)
worker_adapter_manager.vocab_size = (
dummy_model_gate_up.unpadded_vocab_size -
lora_config.lora_extra_vocab_size)
dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size
)
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
dummy_lora_files = f"{tmp_path}/lora_adapter"
@@ -553,49 +567,61 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
)
mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 3, 4}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1, 2, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {1}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None
worker_adapter_manager.set_active_adapters([
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.list_adapters() == {6, 7, 8}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6
@@ -603,17 +629,19 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
# Over capacity
with pytest.raises(RuntimeError):
worker_adapter_manager.set_active_adapters([
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files)
], mapping)
worker_adapter_manager.set_active_adapters(
[
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files),
],
mapping,
)
assert worker_adapter_manager.device == device
assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
device)
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)
@@ -624,7 +652,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
model,
module_name="gate_up_proj",
replaced_module_names=["gate_proj", "up_proj"],
device=device)
device=device,
)
model_lora1 = create_packed_lora(
2,
model,
@@ -634,19 +663,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
empty_replaced_module_name="gate_proj",
)
manager = LoRAModelManager(model,
2,
2,
2,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=2,
max_loras=2,
lora_dtype=DEFAULT_DTYPE),
device=device)
manager = LoRAModelManager(
model,
2,
2,
2,
LoRAConfig(
max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
),
device=device,
)
model = manager.model
assert isinstance(model.get_submodule("gate_up_proj"),
MergedColumnParallelLinearWithLoRA)
assert isinstance(
model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA
)
# Verify packed lora is correct
model_lora_clone = model_lora.clone(1)
model_lora_clone1 = model_lora1.clone(1)
@@ -659,21 +690,27 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
packed_lora = model_lora.get_lora("gate_up_proj")
assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
torch.testing.assert_close(packed_lora.lora_a[0],
model_lora_clone.get_lora("gate_proj").lora_a)
torch.testing.assert_close(packed_lora.lora_b[0],
model_lora_clone.get_lora("gate_proj").lora_b)
torch.testing.assert_close(packed_lora.lora_a[1],
model_lora_clone.get_lora("up_proj").lora_a)
torch.testing.assert_close(packed_lora.lora_b[1],
model_lora_clone.get_lora("up_proj").lora_b)
torch.testing.assert_close(
packed_lora.lora_a[0], model_lora_clone.get_lora("gate_proj").lora_a
)
torch.testing.assert_close(
packed_lora.lora_b[0], model_lora_clone.get_lora("gate_proj").lora_b
)
torch.testing.assert_close(
packed_lora.lora_a[1], model_lora_clone.get_lora("up_proj").lora_a
)
torch.testing.assert_close(
packed_lora.lora_b[1], model_lora_clone.get_lora("up_proj").lora_b
)
packed_lora1 = model_lora1.get_lora("gate_up_proj")
assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
assert packed_lora1.lora_a[0] is None
assert packed_lora1.lora_b[0] is None
torch.testing.assert_close(packed_lora1.lora_a[1],
model_lora_clone1.get_lora("up_proj").lora_a)
torch.testing.assert_close(packed_lora1.lora_b[1],
model_lora_clone1.get_lora("up_proj").lora_b)
torch.testing.assert_close(
packed_lora1.lora_a[1], model_lora_clone1.get_lora("up_proj").lora_a
)
torch.testing.assert_close(
packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
)

View File

@@ -15,7 +15,8 @@ MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
"(<image>./</image>)\nWhat is in the image?<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n")
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
IMAGE_ASSETS = [
ImageAsset("stop_sign"),
@@ -34,18 +35,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
stop_token_ids=[128001, 128009], # eos_id, eot_id
)
inputs = [{
"prompt": PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in IMAGE_ASSETS]
inputs = [
{
"prompt": PROMPT_TEMPLATE,
"multi_modal_data": {"image": asset.pil_image},
}
for asset in IMAGE_ASSETS
]
outputs = llm.generate(
inputs,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts: list[str] = []
@@ -58,7 +59,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
reason="MiniCPM-V dependency xformers incompatible with ROCm",
)
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
@@ -68,10 +70,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
max_lora_rank=8,
enforce_eager=True,
max_model_len=2048,
limit_mm_per_prompt={
"image": 2,
"video": 0
},
limit_mm_per_prompt={"image": 2, "video": 0},
trust_remote_code=True,
)
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -82,11 +81,13 @@ def test_minicpmv_lora(minicpmv_lora_files):
assert EXPECTED_OUTPUT[i].startswith(output2[i])
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
reason="MiniCPM-V dependency xformers incompatible with ROCm",
)
@create_new_process_for_each_test()
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
@@ -96,10 +97,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
limit_mm_per_prompt={
"image": 2,
"video": 0
},
limit_mm_per_prompt={"image": 2, "video": 0},
trust_remote_code=True,
)
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -107,11 +105,13 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
reason="MiniCPM-V dependency xformers incompatible with ROCm",
)
@create_new_process_for_each_test()
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
@@ -122,10 +122,7 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
max_lora_rank=8,
tensor_parallel_size=4,
trust_remote_code=True,
limit_mm_per_prompt={
"image": 1,
"video": 0
},
limit_mm_per_prompt={"image": 1, "video": 0},
fully_sharded_loras=True,
)
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)

View File

@@ -11,15 +11,15 @@ from vllm.platforms import current_platform
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
prompts: list[str]) -> list[str]:
def do_sample(
llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
) -> list[str]:
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -33,8 +33,11 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size):
"""Original test, the LoRA model has the common target modules, not all"""
if torch.cuda.device_count(
) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
if (
torch.cuda.device_count() < tp_size
and tp_size > 1
and current_platform.is_cuda_alike()
):
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
prompts = [
@@ -57,7 +60,11 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
"give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501
]
assert do_sample(llm, mixtral_lora_files, lora_id=1,
prompts=prompts) == expected_lora_output
assert do_sample(llm, mixtral_lora_files, lora_id=2,
prompts=prompts) == expected_lora_output
assert (
do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts)
== expected_lora_output
)
assert (
do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts)
== expected_lora_output
)

View File

@@ -13,34 +13,27 @@ from vllm.lora.peft_helper import PEFTHelper
ERROR_CASES = [
(
"test_rank",
{
"r": 1024
},
{"r": 1024},
"is greater than max_lora_rank",
),
(
"test_bias",
{
"bias": "all"
},
{"bias": "all"},
"Adapter bias cannot be used without bias_enabled",
),
("test_dora", {
"use_dora": True
}, "does not yet support DoRA"),
("test_dora", {"use_dora": True}, "does not yet support DoRA"),
(
"test_modules_to_save",
{
"modules_to_save": ["lm_head"]
},
{"modules_to_save": ["lm_head"]},
"only supports modules_to_save being None",
),
]
def test_peft_helper_pass(sql_lora_files, tmp_path):
peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(
sql_lora_files, max_position_embeddings=4096
)
lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
peft_helper.validate_legal(lora_config)
assert peft_helper.r == 8
@@ -74,8 +67,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
with open(config_path, "w") as f:
json.dump(adapter_config, f)
peft_helper = PEFTHelper.from_local_dir(test_dir,
max_position_embeddings=4096)
peft_helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096)
peft_helper.validate_legal(lora_config)
scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
@@ -106,4 +98,5 @@ def test_peft_helper_error(
# Test loading the adapter
with pytest.raises(ValueError, match=expected_error):
PEFTHelper.from_local_dir(
test_dir, max_position_embeddings=4096).validate_legal(lora_config)
test_dir, max_position_embeddings=4096
).validate_legal(lora_config)

View File

@@ -21,11 +21,18 @@ def reset_device(reset_default_device):
# Utility shrink and expand operations used as reference implementations.
def sgmv_shrink_for_nslices(
nslices: int, inputs_tensor: torch.Tensor,
lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
num_tokens: int, scaling: float):
nslices: int,
inputs_tensor: torch.Tensor,
lora_weights_lst: list[torch.Tensor],
out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor,
seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor,
batches: int,
max_seq_length: int,
num_tokens: int,
scaling: float,
):
"""
Wrapper around torch_ops.sgmv_shrink that handles any nslices.
"""
@@ -44,15 +51,20 @@ def sgmv_shrink_for_nslices(
)
def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
inputs_tensor: torch.Tensor,
lora_weights_lst: list[torch.Tensor],
out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor,
seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor, batches: int,
max_seq_length: int, num_tokens: int,
add_inputs: bool) -> None:
def sgmv_expand_for_nslices(
nslices: int,
hidden_size: int,
inputs_tensor: torch.Tensor,
lora_weights_lst: list[torch.Tensor],
out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor,
seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor,
batches: int,
max_seq_length: int,
num_tokens: int,
add_inputs: bool,
) -> None:
"""
Wrapper around torch_ops.sgmv_expand that handles any nslices.
"""
@@ -94,10 +106,17 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
_dict_lock = Lock()
def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
scaling: float):
def check_lora_shrink_kernel(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str,
seq_length: int,
scaling: float,
):
"""
Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
kernels.
@@ -116,14 +135,19 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
max_seq_length, token_nums = data.meta()
# Setup metadata information for SGMV and reference kernels
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
sgmv_meta_args = (
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
)
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
lora_meta = LoRAKernelMeta.make(
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
)
lora_meta.prepare_tensors(data.token_lora_mapping)
ref_out_tensor = data.ref_out_tensor
@@ -154,10 +178,17 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
assert_close(out_tensor, ref_out_tensor)
def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
add_inputs: bool):
def check_lora_expand_kernel(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str,
seq_length: int,
add_inputs: bool,
):
"""
Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
kernels.
@@ -177,14 +208,19 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
max_seq_length, token_nums = data.meta()
# Setup metadata information for SGMV and reference kernels
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
sgmv_meta_args = (
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
)
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
lora_meta = LoRAKernelMeta.make(
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
)
lora_meta.prepare_tensors(data.token_lora_mapping)
# Setup output tensors
@@ -194,21 +230,25 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
with _dict_lock:
# lora_expand kernel
_LORA_B_PTR_DICT.clear()
triton_ops.lora_expand(data.inputs_tensor,
data.lora_weights,
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
offset_start=0,
add_inputs=add_inputs)
triton_ops.lora_expand(
data.inputs_tensor,
data.lora_weights,
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
offset_start=0,
add_inputs=add_inputs,
)
# Reference
sgmv_expand_for_nslices(nslices,
hidden_size,
data.inputs_tensor,
data.lora_weights,
ref_out_tensor,
*sgmv_meta_args,
add_inputs=add_inputs)
sgmv_expand_for_nslices(
nslices,
hidden_size,
data.inputs_tensor,
data.lora_weights,
ref_out_tensor,
*sgmv_meta_args,
add_inputs=add_inputs,
)
assert_close(out_tensor, ref_out_tensor)
@@ -299,7 +339,7 @@ HIDDEN_SIZES = [
128000,
128256,
]
#The size of TP
# The size of TP
divisibility = [1, 2, 8, 16, 64]
all_hidden_size = []
@@ -331,10 +371,10 @@ DEVICES = [f"cuda:{0}"]
SEED = [0]
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("batches", test_params["batches"])
@pytest.mark.parametrize("num_loras", test_params["num_loras"])
@pytest.mark.parametrize("rank", test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@@ -358,31 +398,35 @@ def test_kernels(
current_platform.seed_everything(seed)
if op_type == "shrink":
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
check_lora_shrink_kernel(
batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5,
)
else:
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
check_lora_expand_kernel(
batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True,
)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("batches", hs_test_params["batches"])
@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@@ -406,22 +450,26 @@ def test_kernels_hidden_size(
current_platform.seed_everything(seed)
if op_type == "shrink":
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
check_lora_shrink_kernel(
batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5,
)
else:
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
check_lora_expand_kernel(
batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True,
)

View File

@@ -20,28 +20,27 @@ class ModelWithQuantization:
MODELS: list[ModelWithQuantization]
#AWQ quantization is currently not supported in ROCm.
# AWQ quantization is currently not supported in ROCm.
if current_platform.is_rocm():
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="gptq"),
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
),
]
else:
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="awq"),
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
),
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="gptq"),
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
),
]
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
max_tokens: int = 256) -> list[str]:
def do_sample(
llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
) -> list[str]:
raw_prompts = [
"Give me an orange-ish brown color",
"Give me a neon pink color",
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,
prompts = [format_prompt_tuples(p) for p in raw_prompts]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=max_tokens,
stop=["<|im_end|>"])
sampling_params = vllm.SamplingParams(
temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,
@pytest.mark.parametrize("model", MODELS)
def test_quant_model_lora(tinyllama_lora_files, model):
llm = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
gpu_memory_utilization=0.2, #avoid OOM
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
trust_remote_code=True,
enable_chunked_prefill=True,
tokenizer=tinyllama_lora_files)
tokenizer=tinyllama_lora_files,
)
if model.quantization is None:
expected_lora_output = [
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
def expect_match(output, expected_output):
# HACK: GPTQ lora outputs are just incredibly unstable.
# Assert that the outputs changed.
if (model.quantization == "gptq"
and expected_output is expected_lora_output):
if model.quantization == "gptq" and expected_output is expected_lora_output:
for i, o in enumerate(output):
assert o.startswith(
'#'), f"Expected example {i} to start with # but got {o}"
assert o.startswith("#"), (
f"Expected example {i} to start with # but got {o}"
)
return
assert output == expected_output
@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
print("lora adapter created")
print("lora 1")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=1,
max_tokens=max_tokens)
output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
expect_match(output, expected_lora_output)
print("lora 2")
output = do_sample(llm,
tinyllama_lora_files,
lora_id=2,
max_tokens=max_tokens)
output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
expect_match(output, expected_lora_output)
print("removing lora")
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
@pytest.mark.parametrize("model", MODELS)
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
model):
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
if num_gpus_available < 2:
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
if model.quantization == "gptq":
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
gpu_memory_utilization=0.2, #avoid OOM
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
trust_remote_code=True,
enable_chunked_prefill=True)
enable_chunked_prefill=True,
)
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
del llm_tp1
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
gpu_memory_utilization=0.2, #avoid OOM
gpu_memory_utilization=0.2, # avoid OOM
quantization=model.quantization,
enable_chunked_prefill=True)
enable_chunked_prefill=True,
)
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
del llm_tp2

View File

@@ -37,7 +37,8 @@ class Qwen2VLTester:
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
"<|im_start|>assistant\n"
)
def __init__(self, config: TestConfig):
self.config = config
@@ -56,68 +57,68 @@ class Qwen2VLTester:
max_model_len=self.config.max_model_len,
)
def run_test(self,
images: list[ImageAsset],
expected_outputs: list[str],
lora_id: Optional[int] = None,
temperature: float = 0,
max_tokens: int = 5):
def run_test(
self,
images: list[ImageAsset],
expected_outputs: list[str],
lora_id: Optional[int] = None,
temperature: float = 0,
max_tokens: int = 5,
):
sampling_params = vllm.SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
)
inputs = [{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in images]
lora_request = LoRARequest(str(lora_id), lora_id,
self.config.lora_path)
outputs = self.llm.generate(inputs,
sampling_params,
lora_request=lora_request)
generated_texts = [
output.outputs[0].text.strip() for output in outputs
inputs = [
{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {"image": asset.pil_image},
}
for asset in images
]
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
generated_texts = [output.outputs[0].text.strip() for output in outputs]
# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(
generated), f"Generated text {generated} doesn't "
assert expected.startswith(generated), (
f"Generated text {generated} doesn't "
)
f"match expected pattern {expected}"
def run_beam_search_test(self,
images: list[ImageAsset],
expected_outputs: list[list[str]],
lora_id: Optional[int] = None,
temperature: float = 0,
beam_width: int = 2,
max_tokens: int = 5):
def run_beam_search_test(
self,
images: list[ImageAsset],
expected_outputs: list[list[str]],
lora_id: Optional[int] = None,
temperature: float = 0,
beam_width: int = 2,
max_tokens: int = 5,
):
beam_search_params = BeamSearchParams(
beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
)
beam_search_params = BeamSearchParams(beam_width=beam_width,
max_tokens=max_tokens,
temperature=temperature)
inputs = [
{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {"image": asset.pil_image},
}
for asset in images
]
inputs = [{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in images]
lora_request = LoRARequest(str(lora_id), lora_id,
self.config.lora_path)
outputs = self.llm.beam_search(inputs,
beam_search_params,
lora_request=lora_request)
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
outputs = self.llm.beam_search(
inputs, beam_search_params, lora_request=lora_request
)
for output_obj, expected_outs in zip(outputs, expected_outputs):
output_texts = [seq.text for seq in output_obj.sequences]
assert output_texts == expected_outs, \
f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501
assert output_texts == expected_outs, (
f"Generated texts {output_texts} do not match expected {expected_outs}"
) # noqa: E501
TEST_IMAGES = [
@@ -144,27 +145,25 @@ QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
reason="Qwen2-VL dependency xformers incompatible with ROCm",
)
def test_qwen2vl_lora(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
lora_path=qwen2vl_lora_files)
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
reason="Qwen2-VL dependency xformers incompatible with ROCm",
)
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA through beam search."""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
lora_path=qwen2vl_lora_files)
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
@@ -176,7 +175,8 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
tester.run_beam_search_test(
[ImageAsset("cherry_blossom")],
expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
lora_id=lora_id)
lora_id=lora_id,
)
@pytest.mark.xfail(
@@ -185,12 +185,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
lora_path=qwen25vl_lora_files)
config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)

View File

@@ -12,13 +12,15 @@ from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
class DummyLoRAResolver(LoRAResolver):
"""A dummy LoRA resolver for testing."""
async def resolve_lora(self, base_model_name: str,
lora_name: str) -> Optional[LoRARequest]:
async def resolve_lora(
self, base_model_name: str, lora_name: str
) -> Optional[LoRARequest]:
if lora_name == "test_lora":
return LoRARequest(
lora_name=lora_name,
lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
lora_int_id=abs(hash(lora_name)))
lora_int_id=abs(hash(lora_name)),
)
return None
@@ -70,6 +72,5 @@ async def test_dummy_resolver_resolve():
assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"
# Test failed resolution
result = await dummy_resolver.resolve_lora(base_model_name,
"nonexistent_lora")
result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora")
assert result is None

View File

@@ -24,20 +24,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
@@ -49,13 +47,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
def test_ilama_lora(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
trust_remote_code=True,
enable_chunked_prefill=True)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
trust_remote_code=True,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -65,20 +65,23 @@ def test_ilama_lora(ilama_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=False,
enable_chunked_prefill=True)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=False,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -88,20 +91,23 @@ def test_ilama_lora_tp4(ilama_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True)
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=16,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -9,8 +9,11 @@ import pytest
from huggingface_hub.utils import HfHubHTTPError
from torch import nn
from vllm.lora.utils import (get_adapter_absolute_path,
parse_fine_tuned_lora_name, replace_submodule)
from vllm.lora.utils import (
get_adapter_absolute_path,
parse_fine_tuned_lora_name,
replace_submodule,
)
from vllm.model_executor.models.utils import WeightsMapper
@@ -24,10 +27,12 @@ class LoRANameParserTestConfig(NamedTuple):
def test_parse_fine_tuned_lora_name_valid():
fixture = [
LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight",
"lm_head", True, False),
LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight",
"lm_head", False, False),
LoRANameParserTestConfig(
"base_model.model.lm_head.lora_A.weight", "lm_head", True, False
),
LoRANameParserTestConfig(
"base_model.model.lm_head.lora_B.weight", "lm_head", False, False
),
LoRANameParserTestConfig(
"base_model.model.model.embed_tokens.lora_embedding_A",
"model.embed_tokens",
@@ -71,7 +76,8 @@ def test_parse_fine_tuned_lora_name_valid():
True,
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}),
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
@@ -79,7 +85,8 @@ def test_parse_fine_tuned_lora_name_valid():
False,
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}),
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"model.layers.9.mlp.down_proj.lora_A.weight",
@@ -87,7 +94,8 @@ def test_parse_fine_tuned_lora_name_valid():
True,
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}),
orig_to_new_prefix={"model.": "language_model.model."}
),
),
LoRANameParserTestConfig(
"model.layers.9.mlp.down_proj.lora_B.weight",
@@ -95,12 +103,14 @@ def test_parse_fine_tuned_lora_name_valid():
False,
False,
weights_mapper=WeightsMapper(
orig_to_new_prefix={"model.": "language_model.model."}),
orig_to_new_prefix={"model.": "language_model.model."}
),
),
]
for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
assert (module_name, is_lora_a,
is_bias) == parse_fine_tuned_lora_name(name, weights_mapper)
assert (module_name, is_lora_a, is_bias) == parse_fine_tuned_lora_name(
name, weights_mapper
)
def test_parse_fine_tuned_lora_name_invalid():
@@ -115,22 +125,28 @@ def test_parse_fine_tuned_lora_name_invalid():
def test_replace_submodule():
model = nn.Sequential(
OrderedDict([
("dense1", nn.Linear(764, 100)),
("act1", nn.ReLU()),
("dense2", nn.Linear(100, 50)),
(
"seq1",
nn.Sequential(
OrderedDict([
("dense1", nn.Linear(100, 10)),
("dense2", nn.Linear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", nn.Linear(50, 10)),
("outact", nn.Sigmoid()),
]))
OrderedDict(
[
("dense1", nn.Linear(764, 100)),
("act1", nn.ReLU()),
("dense2", nn.Linear(100, 50)),
(
"seq1",
nn.Sequential(
OrderedDict(
[
("dense1", nn.Linear(100, 10)),
("dense2", nn.Linear(10, 50)),
]
)
),
),
("act2", nn.ReLU()),
("output", nn.Linear(50, 10)),
("outact", nn.Sigmoid()),
]
)
)
sigmoid = nn.Sigmoid()
@@ -143,52 +159,51 @@ def test_replace_submodule():
# Unit tests for get_adapter_absolute_path
@patch('os.path.isabs')
@patch("os.path.isabs")
def test_get_adapter_absolute_path_absolute(mock_isabs):
path = '/absolute/path/to/lora'
path = "/absolute/path/to/lora"
mock_isabs.return_value = True
assert get_adapter_absolute_path(path) == path
@patch('os.path.expanduser')
@patch("os.path.expanduser")
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
# Path with ~ that needs to be expanded
path = '~/relative/path/to/lora'
absolute_path = '/home/user/relative/path/to/lora'
path = "~/relative/path/to/lora"
absolute_path = "/home/user/relative/path/to/lora"
mock_expanduser.return_value = absolute_path
assert get_adapter_absolute_path(path) == absolute_path
@patch('os.path.exists')
@patch('os.path.abspath')
@patch("os.path.exists")
@patch("os.path.abspath")
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
# Relative path that exists locally
path = 'relative/path/to/lora'
absolute_path = '/absolute/path/to/lora'
path = "relative/path/to/lora"
absolute_path = "/absolute/path/to/lora"
mock_exist.return_value = True
mock_abspath.return_value = absolute_path
assert get_adapter_absolute_path(path) == absolute_path
@patch('huggingface_hub.snapshot_download')
@patch('os.path.exists')
def test_get_adapter_absolute_path_huggingface(mock_exist,
mock_snapshot_download):
@patch("huggingface_hub.snapshot_download")
@patch("os.path.exists")
def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
# Hugging Face model identifier
path = 'org/repo'
absolute_path = '/mock/snapshot/path'
path = "org/repo"
absolute_path = "/mock/snapshot/path"
mock_exist.return_value = False
mock_snapshot_download.return_value = absolute_path
assert get_adapter_absolute_path(path) == absolute_path
@patch('huggingface_hub.snapshot_download')
@patch('os.path.exists')
def test_get_adapter_absolute_path_huggingface_error(mock_exist,
mock_snapshot_download):
@patch("huggingface_hub.snapshot_download")
@patch("os.path.exists")
def test_get_adapter_absolute_path_huggingface_error(
mock_exist, mock_snapshot_download
):
# Hugging Face model identifier with download error
path = 'org/repo'
path = "org/repo"
mock_exist.return_value = False
mock_snapshot_download.side_effect = HfHubHTTPError(
"failed to query model info")
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
assert get_adapter_absolute_path(path) == path

View File

@@ -6,8 +6,14 @@ import random
import tempfile
from unittest.mock import patch
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
)
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.models import LoRAMapping
@@ -19,12 +25,12 @@ NUM_LORAS = 16
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
lora_mapping = LoRAMapping([], [])
worker.model_runner.lora_manager.set_active_adapters(
lora_requests, lora_mapping)
lora_requests, lora_mapping
)
vllm_config = VllmConfig(
model_config=ModelConfig(
@@ -49,9 +55,9 @@ def test_worker_apply_lora(sql_lora_files):
swap_space=0,
cache_dtype="auto",
),
lora_config=LoRAConfig(max_lora_rank=8,
max_cpu_loras=NUM_LORAS,
max_loras=NUM_LORAS),
lora_config=LoRAConfig(
max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
),
)
worker = Worker(
vllm_config=vllm_config,
@@ -67,23 +73,22 @@ def test_worker_apply_lora(sql_lora_files):
assert worker.list_loras() == set()
lora_requests = [
LoRARequest(str(i + 1), i + 1, sql_lora_files)
for i in range(NUM_LORAS)
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS)
]
set_active_loras(worker, lora_requests)
assert worker.list_loras() == {
lora_request.lora_int_id
for lora_request in lora_requests
lora_request.lora_int_id for lora_request in lora_requests
}
for i in range(NUM_LORAS):
random.seed(i)
iter_lora_requests = random.choices(lora_requests,
k=random.randint(1, NUM_LORAS))
iter_lora_requests = random.choices(
lora_requests, k=random.randint(1, NUM_LORAS)
)
random.shuffle(iter_lora_requests)
iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
set_active_loras(worker, lora_requests)
assert worker.list_loras().issuperset(
{lora_request.lora_int_id
for lora_request in iter_lora_requests})
{lora_request.lora_int_id for lora_request in iter_lora_requests}
)

View File

@@ -13,7 +13,6 @@ from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
class DummyLoRAManager:
def __init__(self, device: torch.device = "cuda:0"):
super().__init__()
self._loras: dict[str, LoRALayerWeights] = {}
@@ -36,12 +35,12 @@ class DummyLoRAManager:
module_name,
rank=rank,
lora_alpha=1,
lora_a=torch.rand([rank, weight.shape[1]],
dtype=weight.dtype,
device=self._device),
lora_b=torch.rand([weight.shape[0], rank],
dtype=weight.dtype,
device=self._device),
lora_a=torch.rand(
[rank, weight.shape[1]], dtype=weight.dtype, device=self._device
),
lora_b=torch.rand(
[weight.shape[0], rank], dtype=weight.dtype, device=self._device
),
)
if generate_embeddings_tensor:
lora.embeddings_tensor = torch.rand(
@@ -146,27 +145,26 @@ def generate_data(
op_type,
device,
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
b_seq_start_loc = torch.cumsum(
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0,
).to(device)
total_tokens = seq_len_tensor.sum()
if op_type == "shrink":
inputs_tensor = torch.rand((total_tokens, hidden_size),
dtype=dtype).to(device)
inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
lora_weights = torch.rand(
(lora_nums, max_rank, hidden_size), # col-major
dtype=dtype,
).to(device)
# shrink op needs atomic_add, so output is initialized to 0
ref_out_tensor = torch.zeros((total_tokens, max_rank),
dtype=dtype,
device=inputs_tensor.device)
ref_out_tensor = torch.zeros(
(total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
)
# NOTE shrink kernel using torch.float32 as output type
our_out_tensor = torch.zeros((total_tokens, max_rank),
dtype=torch.float32).to(device)
our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to(
device
)
else:
inputs_tensor = torch.rand(
(total_tokens, max_rank),
@@ -184,15 +182,16 @@ def generate_data(
).to(device)
# Ensure the same input.
our_out_tensor = ref_out_tensor.clone()
lora_indices_tensor = torch.randint(0,
lora_nums - 1 if lora_nums > 1 else 1,
(batches, )).to(device)
lora_indices_tensor = torch.randint(
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
).to(device)
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
current_offset = 0
for b_id in range(batches):
lora_index = lora_indices_tensor[b_id]
indices[current_offset:current_offset +
seq_len_tensor[b_id]].copy_(lora_index)
indices[current_offset : current_offset + seq_len_tensor[b_id]].copy_(
lora_index
)
current_offset += seq_len_tensor[b_id].item()
return PunicaTensors(
@@ -217,8 +216,7 @@ def generate_data_for_expand_nslices(
nslices,
device,
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
b_seq_start_loc = torch.cumsum(
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0,
@@ -234,22 +232,25 @@ def generate_data_for_expand_nslices(
torch.rand(
(lora_nums, hidden_size, max_rank), # col-major
dtype=dtype,
).to(device))
).to(device)
)
# expand op needs to complete y+=a@lora_b, so output is
# initialized randomly
ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
dtype=dtype).to(device)
ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to(
device
)
# Ensure the same input.
our_out_tensor = ref_out_tensor.clone()
lora_indices_tensor = torch.randint(0,
lora_nums - 1 if lora_nums > 1 else 1,
(batches, ))
lora_indices_tensor = torch.randint(
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
)
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
current_offset = 0
for b_id in range(batches):
lora_index = lora_indices_tensor[b_id]
indices[current_offset:current_offset +
seq_len_tensor[b_id]] = (lora_index.item())
indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
lora_index.item()
)
current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device)
@@ -276,8 +277,7 @@ def generate_data_for_nslices(
op_type,
device,
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
b_seq_start_loc = torch.cumsum(
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0,
@@ -286,9 +286,7 @@ def generate_data_for_nslices(
lora_weights_lst = []
if op_type == "shrink":
inputs_tensor = torch.rand((total_tokens, hidden_size),
dtype=dtype).to(device)
inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
for _ in range(nslices):
if op_type == "shrink":
@@ -296,7 +294,8 @@ def generate_data_for_nslices(
torch.rand(
(lora_nums, max_rank, hidden_size), # col-major
dtype=dtype,
).to(device))
).to(device)
)
# NOTE shrink kernel using torch.float32 as output type
# shrink op needs atomic_add, so output is initialized to 0
our_out_tensor = torch.zeros(
@@ -313,23 +312,26 @@ def generate_data_for_nslices(
torch.rand(
(lora_nums, hidden_size, max_rank), # col-major
dtype=dtype,
).to(device))
).to(device)
)
# expand op needs to complete y+=a@lora_b, so output is
# initialized randomly
our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
dtype=dtype).to(device)
our_out_tensor = torch.rand(
(total_tokens, hidden_size * nslices), dtype=dtype
).to(device)
# Ensure the same input.
ref_out_tensor = our_out_tensor.clone()
lora_indices_tensor = torch.randint(0,
lora_nums - 1 if lora_nums > 1 else 1,
(batches, ))
lora_indices_tensor = torch.randint(
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
)
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
current_offset = 0
for b_id in range(batches):
lora_index = lora_indices_tensor[b_id]
indices[current_offset:current_offset +
seq_len_tensor[b_id]] = (lora_index.item())
indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
lora_index.item()
)
current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device)
@@ -379,24 +381,20 @@ def create_peft_lora(
}
for module_name in target_modules:
module = model
for attr in module_name.split("."):
module = getattr(module, attr)
if hasattr(module, "input_size") and hasattr(module, "output_size"):
in_features = module.input_size
out_features = module.output_size
elif hasattr(module, "embedding_dim") and hasattr(
module, "num_embeddings"):
elif hasattr(module, "embedding_dim") and hasattr(module, "num_embeddings"):
# ParallelLMHead
in_features = module.embedding_dim
out_features = module.num_embeddings
else:
raise ValueError(
f"Unable to determine dimensions for module {module_name}")
raise ValueError(f"Unable to determine dimensions for module {module_name}")
lora_A = torch.randn(rank, in_features, dtype=lora_dtype)