Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -10,12 +10,16 @@ import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.distributed import (
|
||||
cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
@@ -47,11 +51,13 @@ def dist_init():
|
||||
if current_platform.is_cpu() or current_platform.is_tpu():
|
||||
backend = "gloo"
|
||||
|
||||
init_distributed_environment(world_size=1,
|
||||
rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
local_rank=0,
|
||||
backend=backend)
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
local_rank=0,
|
||||
backend=backend,
|
||||
)
|
||||
initialize_model_parallel(1, 1)
|
||||
yield
|
||||
cleanup_dist_env_and_memory(shutdown_ray=True)
|
||||
@@ -66,10 +72,9 @@ def dist_init_torch_only():
|
||||
backend = "gloo"
|
||||
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
torch.distributed.init_process_group(world_size=1,
|
||||
rank=0,
|
||||
init_method=f"file://{temp_file}",
|
||||
backend=backend)
|
||||
torch.distributed.init_process_group(
|
||||
world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
|
||||
)
|
||||
|
||||
|
||||
class DummyLoRAModel(nn.Sequential, SupportsLoRA):
|
||||
@@ -79,24 +84,30 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
|
||||
@pytest.fixture
|
||||
def dummy_model() -> nn.Module:
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]))
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", ColumnParallelLinear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
model.unpadded_vocab_size = 32000
|
||||
@@ -106,24 +117,30 @@ def dummy_model() -> nn.Module:
|
||||
@pytest.fixture
|
||||
def dummy_model_gate_up() -> nn.Module:
|
||||
model = DummyLoRAModel(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]))
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(764, 100)),
|
||||
("dense2", RowParallelLinear(100, 50)),
|
||||
(
|
||||
"layer1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", ColumnParallelLinear(100, 10)),
|
||||
("dense2", RowParallelLinear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
|
||||
("outact", nn.Sigmoid()),
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
]
|
||||
)
|
||||
)
|
||||
model.config = MagicMock()
|
||||
model.packed_modules_mapping = {
|
||||
"gate_up_proj": [
|
||||
|
||||
@@ -7,7 +7,8 @@ import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
build_async_engine_client_from_engine_args,
|
||||
)
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@@ -26,14 +27,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]:
|
||||
return lora_requests
|
||||
|
||||
|
||||
async def requests_processing_time(llm,
|
||||
lora_requests: list[LoRARequest]) -> float:
|
||||
|
||||
sampling_params = SamplingParams(n=1,
|
||||
temperature=0.0,
|
||||
top_p=1.0,
|
||||
ignore_eos=True,
|
||||
max_tokens=1)
|
||||
async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
|
||||
sampling_params = SamplingParams(
|
||||
n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
|
||||
)
|
||||
|
||||
generators = []
|
||||
start = time.perf_counter()
|
||||
@@ -41,11 +38,11 @@ async def requests_processing_time(llm,
|
||||
for lora_request in lora_requests:
|
||||
lora_int_id = lora_request.lora_int_id
|
||||
generator = llm.generate(
|
||||
prompt=TextPrompt(prompt=f"hello {lora_int_id}",
|
||||
multi_modal_data=None), # type: ignore
|
||||
prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None), # type: ignore
|
||||
sampling_params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
request_id=f"test{lora_int_id}")
|
||||
request_id=f"test{lora_int_id}",
|
||||
)
|
||||
generators.append(generator)
|
||||
|
||||
all_gens = merge_async_iterators(*generators)
|
||||
@@ -58,13 +55,13 @@ async def requests_processing_time(llm,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
"""
|
||||
"""
|
||||
The add_lora function is used to preload some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
this functionality, we use the async engine to process some requests - We
|
||||
do it twice, once with add_lora() preloading and once without.
|
||||
|
||||
We measure the request processing time in both cases and expect the time
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lower in the case with add_lora() calls.
|
||||
"""
|
||||
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
|
||||
@@ -78,18 +75,18 @@ async def test_add_lora(chatglm3_lora_files):
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8, #avoid OOM
|
||||
gpu_memory_utilization=0.8, # avoid OOM
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True)
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
# split lora_requests into 3 parts
|
||||
part_size = len(lora_requests) // 3
|
||||
dummy_run_requests = lora_requests[:part_size]
|
||||
warmup_run_requests = lora_requests[part_size:part_size * 2]
|
||||
cold_run_requests = lora_requests[part_size * 2:]
|
||||
warmup_run_requests = lora_requests[part_size : part_size * 2]
|
||||
cold_run_requests = lora_requests[part_size * 2 :]
|
||||
|
||||
async with build_async_engine_client_from_engine_args(engine_args) as llm:
|
||||
|
||||
# Dummy run - So any 1-time functionality like triton kernel compilation
|
||||
# is complete here.
|
||||
await requests_processing_time(llm, dummy_run_requests)
|
||||
@@ -101,18 +98,16 @@ async def test_add_lora(chatglm3_lora_files):
|
||||
# Test that all add_lora calls are successful.
|
||||
assert all(add_lora_results)
|
||||
|
||||
time_with_add_lora = await requests_processing_time(
|
||||
llm, warmup_run_requests)
|
||||
time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)
|
||||
|
||||
# Run without any warmup
|
||||
time_cold_start = await requests_processing_time(
|
||||
llm, cold_run_requests)
|
||||
time_cold_start = await requests_processing_time(llm, cold_run_requests)
|
||||
|
||||
print(f"time hot-start {time_with_add_lora} vs "
|
||||
f"time cold-start {time_cold_start} ")
|
||||
print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")
|
||||
|
||||
assert time_with_add_lora < time_cold_start, (
|
||||
f"time_with_add_lora={time_with_add_lora}, "
|
||||
f"time_cold_start={time_cold_start}"
|
||||
"The engine request processing time with LoRA pre-loading "
|
||||
"must be less than the version that does on-demand LoRA loading.")
|
||||
"must be less than the version that does on-demand LoRA loading."
|
||||
)
|
||||
|
||||
@@ -21,20 +21,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
|
||||
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -47,13 +45,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
@@ -66,15 +66,17 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
@@ -90,16 +92,18 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
|
||||
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
|
||||
# more GPU memory causing vLLM to OOM
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True,
|
||||
gpu_memory_utilization=0.85)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True,
|
||||
gpu_memory_utilization=0.85,
|
||||
)
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
@@ -32,15 +32,12 @@ VLLM_RUNNER_BASE_KWARGS = {
|
||||
"max_lora_rank": 320,
|
||||
"max_model_len": 12800,
|
||||
"gpu_memory_utilization": 0.8,
|
||||
"limit_mm_per_prompt": {
|
||||
"audio": 1
|
||||
},
|
||||
"limit_mm_per_prompt": {"audio": 1},
|
||||
"enforce_eager": True,
|
||||
}
|
||||
|
||||
|
||||
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
|
||||
**kwargs):
|
||||
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
|
||||
inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]
|
||||
|
||||
# Apply any additional kwargs as overrides to the base kwargs
|
||||
@@ -53,11 +50,11 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
|
||||
max_tokens=128,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
) for prompts, audios in inputs
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
|
||||
expected_suffix)
|
||||
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)
|
||||
|
||||
|
||||
def test_active_default_mm_lora(
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -19,27 +19,28 @@ EXPECTED_LORA_OUTPUT = [
|
||||
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
|
||||
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
|
||||
def do_sample(
|
||||
llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
tensorizer_config_dict: Union[dict, None] = None,
|
||||
) -> list[str]:
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501
|
||||
]
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=256,
|
||||
skip_special_tokens=False,
|
||||
stop=["[/assistant]"])
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
|
||||
)
|
||||
|
||||
if tensorizer_config_dict is not None:
|
||||
outputs = llm.generate(
|
||||
@@ -49,14 +50,19 @@ def do_sample(llm: vllm.LLM,
|
||||
str(lora_id),
|
||||
lora_id,
|
||||
lora_path,
|
||||
tensorizer_config_dict=tensorizer_config_dict)
|
||||
if lora_id else None)
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
)
|
||||
if lora_id
|
||||
else None,
|
||||
)
|
||||
else:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
if lora_id
|
||||
else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -67,42 +73,51 @@ def do_sample(llm: vllm.LLM,
|
||||
return generated_texts
|
||||
|
||||
|
||||
def generate_and_test(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict: Union[dict, None] = None):
|
||||
def generate_and_test(
|
||||
llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None
|
||||
):
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=1,
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
print("lora 2")
|
||||
assert do_sample(llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=2) == EXPECTED_LORA_OUTPUT
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=2,
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=13,
|
||||
max_loras=4)
|
||||
max_loras=4,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
@@ -117,7 +132,6 @@ def test_llama_lora_tp4(sql_lora_files):
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
@@ -132,9 +146,9 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
||||
sql_lora_huggingface_id):
|
||||
|
||||
def test_tp2_serialize_and_deserialize_lora(
|
||||
tmp_path, sql_lora_files, sql_lora_huggingface_id
|
||||
):
|
||||
# Run the tensorizing of the LoRA adapter and the model in a subprocess
|
||||
# to guarantee cleanup
|
||||
|
||||
@@ -145,17 +159,28 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
||||
lora_path = sql_lora_huggingface_id
|
||||
suffix = "test"
|
||||
try:
|
||||
result = subprocess.run([
|
||||
sys.executable,
|
||||
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
|
||||
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
|
||||
str(tp_size), "serialize", "--serialized-directory",
|
||||
str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
|
||||
'{"limit_cpu_concurrency": 4}'
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True)
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
|
||||
"--model",
|
||||
MODEL_PATH,
|
||||
"--lora-path",
|
||||
lora_path,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"serialize",
|
||||
"--serialized-directory",
|
||||
str(tmp_path),
|
||||
"--suffix",
|
||||
suffix,
|
||||
"--serialization-kwargs",
|
||||
'{"limit_cpu_concurrency": 4}',
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("Tensorizing failed.")
|
||||
print("STDOUT:\n", e.stdout)
|
||||
@@ -167,21 +192,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
|
||||
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
|
||||
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
|
||||
|
||||
loaded_llm = LLM(model=model_ref,
|
||||
tokenizer=sql_lora_files,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=13,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2)
|
||||
loaded_llm = LLM(
|
||||
model=model_ref,
|
||||
tokenizer=sql_lora_files,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=13,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2,
|
||||
)
|
||||
|
||||
tc_as_dict = tensorizer_config.to_serializable()
|
||||
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
assert do_sample(loaded_llm,
|
||||
sql_lora_files,
|
||||
tensorizer_config_dict=tc_as_dict,
|
||||
lora_id=1) == EXPECTED_LORA_OUTPUT
|
||||
assert (
|
||||
do_sample(
|
||||
loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
@@ -5,6 +5,7 @@ This script contains:
|
||||
1. test multi loras service with tp >= 2
|
||||
2. test multi loras request
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import multi_gpu_test
|
||||
@@ -31,14 +32,8 @@ LORA_TEST_EXPECTED = [
|
||||
|
||||
def format_chatml_messages(prompt: str):
|
||||
return [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
},
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": prompt},
|
||||
]
|
||||
|
||||
|
||||
@@ -57,7 +52,6 @@ def make_add_lora_request(name: str, path: str):
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_multi_loras_with_tp_sync():
|
||||
|
||||
llm = LLM(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
@@ -116,15 +110,17 @@ def test_multi_loras_with_tp_sync():
|
||||
|
||||
def reload_lora(name: str):
|
||||
"""
|
||||
reload a lora to simulate the case:
|
||||
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
|
||||
reload a lora to simulate the case:
|
||||
setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
|
||||
for dynamic lora loading and unloading
|
||||
"""
|
||||
remove_lora_response = llm.llm_engine.remove_lora(
|
||||
lora_id=LORA_NAME_ID_MAP[name])
|
||||
lora_id=LORA_NAME_ID_MAP[name]
|
||||
)
|
||||
|
||||
add_lora_response = llm.llm_engine.add_lora(
|
||||
make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
|
||||
make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
|
||||
)
|
||||
|
||||
print(f"{remove_lora_response=}, {add_lora_response=}")
|
||||
|
||||
@@ -134,7 +130,6 @@ def test_multi_loras_with_tp_sync():
|
||||
assert outputs == expected
|
||||
|
||||
for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
|
||||
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output)
|
||||
|
||||
@@ -175,8 +170,7 @@ def test_multiple_lora_requests():
|
||||
PROMPTS = ["Hello, my name is"] * 2
|
||||
LORA_NAME = "Alice"
|
||||
lora_request = [
|
||||
LoRARequest(LORA_NAME + str(idx), idx + 1,
|
||||
LORA_NAME_PATH_MAP[LORA_NAME])
|
||||
LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
|
||||
for idx in range(len(PROMPTS))
|
||||
]
|
||||
# Multiple SamplingParams should be matched with each prompt
|
||||
|
||||
@@ -8,9 +8,7 @@ from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
|
||||
lora_lst = [
|
||||
"baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
|
||||
]
|
||||
lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
|
||||
BAICHUAN_LORA_MODULES = [
|
||||
"W_pack",
|
||||
"o_proj",
|
||||
@@ -37,8 +35,9 @@ def test_load_checkpoints(
|
||||
else:
|
||||
expected_lora_modules.append(module)
|
||||
if lora_name == "baichuan7B":
|
||||
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
# For the baichuan7B model, load its LoRA,
|
||||
# and the test should pass.
|
||||
LoRAModel.from_local_checkpoint(
|
||||
@@ -48,13 +47,15 @@ def test_load_checkpoints(
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
embedding_padding_modules=embed_padding_modules,
|
||||
)
|
||||
elif lora_name == "baichuan7B-zero":
|
||||
# Test that the target_modules contain prefix
|
||||
# such as "model.layers.0.self_atten.W_pack", and
|
||||
# the test should pass.
|
||||
peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_zero_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
LoRAModel.from_local_checkpoint(
|
||||
baichuan_zero_lora_files,
|
||||
expected_lora_modules,
|
||||
@@ -62,12 +63,14 @@ def test_load_checkpoints(
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
embedding_padding_modules=embed_padding_modules,
|
||||
)
|
||||
elif lora_name == "baichuan7B-zero-regex":
|
||||
# Test that the `target_modules` in the form of regular expressions,
|
||||
# such as `model\\..*(W_pack|o_proj)`, and the test should pass.
|
||||
peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_regex_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
LoRAModel.from_local_checkpoint(
|
||||
baichuan_regex_lora_files,
|
||||
expected_lora_modules,
|
||||
@@ -75,13 +78,15 @@ def test_load_checkpoints(
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
embedding_padding_modules=embed_padding_modules,
|
||||
)
|
||||
else:
|
||||
# For the baichuan7B model, load chatglm3-6b's LoRA,
|
||||
# and the test should raise the following error.
|
||||
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
|
||||
peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
chatglm3_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
LoRAModel.from_local_checkpoint(
|
||||
chatglm3_lora_files,
|
||||
@@ -90,11 +95,11 @@ def test_load_checkpoints(
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
embedding_padding_modules=embed_padding_modules,
|
||||
)
|
||||
|
||||
|
||||
def test_lora_weights_mapping(baichuan_lora_files):
|
||||
|
||||
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
|
||||
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
|
||||
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
|
||||
@@ -113,8 +118,9 @@ def test_lora_weights_mapping(baichuan_lora_files):
|
||||
".layers.": ".baichuan_layers.",
|
||||
},
|
||||
)
|
||||
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
baichuan_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
lora_model = LoRAModel.from_local_checkpoint(
|
||||
baichuan_lora_files,
|
||||
expected_lora_modules,
|
||||
|
||||
@@ -3,11 +3,13 @@
|
||||
"""
|
||||
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
build_async_engine_client_from_engine_args,
|
||||
)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
@@ -17,23 +19,24 @@ LORA_RANK = 8
|
||||
|
||||
|
||||
def make_lora_request(lora_id: int):
|
||||
return LoRARequest(lora_name=f"{lora_id}",
|
||||
lora_int_id=lora_id,
|
||||
lora_path=LORA_MODULE_PATH)
|
||||
return LoRARequest(
|
||||
lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
|
||||
)
|
||||
|
||||
|
||||
def test_lora_functions_sync():
|
||||
|
||||
max_loras = 4
|
||||
# Create engine in eager-mode. Due to high max_loras, the CI can
|
||||
# OOM during cuda-graph capture.
|
||||
engine_args = EngineArgs(model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
llm = LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
@@ -70,15 +73,16 @@ def test_lora_functions_sync():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_lora_functions_async():
|
||||
|
||||
max_loras = 4
|
||||
engine_args = AsyncEngineArgs(model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_loras=max_loras,
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
async def run_check(fn, args, expected: list):
|
||||
await fn(args)
|
||||
|
||||
@@ -11,8 +11,12 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
|
||||
# Provide absolute path and huggingface lora ids
|
||||
lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
|
||||
LLAMA_LORA_MODULES = [
|
||||
"qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
|
||||
"lm_head"
|
||||
"qkv_proj",
|
||||
"o_proj",
|
||||
"gate_up_proj",
|
||||
"down_proj",
|
||||
"embed_tokens",
|
||||
"lm_head",
|
||||
]
|
||||
|
||||
|
||||
@@ -40,7 +44,8 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
|
||||
lora_model_id=1,
|
||||
device="cpu",
|
||||
embedding_modules=embedding_modules,
|
||||
embedding_padding_modules=embed_padding_modules)
|
||||
embedding_padding_modules=embed_padding_modules,
|
||||
)
|
||||
|
||||
# Assertions to ensure the model is loaded correctly
|
||||
assert lora_model is not None, "LoRAModel is not loaded correctly"
|
||||
|
||||
@@ -10,16 +10,21 @@ from torch import nn
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.layers import (
|
||||
ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA,
|
||||
)
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.models import (
|
||||
LoRAMapping,
|
||||
LoRAModel,
|
||||
LoRAModelManager,
|
||||
LRUCacheLoRAModelManager,
|
||||
)
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
|
||||
WorkerLoRAManager)
|
||||
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager, WorkerLoRAManager
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .utils import create_peft_lora
|
||||
@@ -31,22 +36,25 @@ EMBEDDING_MODULES = {
|
||||
|
||||
EMBEDDING_PADDING_MODULES = ["lm_head"]
|
||||
|
||||
DEVICES = ([
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
] if current_platform.is_cuda_alike() else ["cpu"])
|
||||
DEVICES = (
|
||||
[f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
|
||||
if current_platform.is_cuda_alike()
|
||||
else ["cpu"]
|
||||
)
|
||||
|
||||
DEFAULT_DTYPE = torch.get_default_dtype()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_from_lora_tensors(sql_lora_files, device):
|
||||
tensors = load_file(
|
||||
os.path.join(sql_lora_files, "adapter_model.safetensors"))
|
||||
tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
|
||||
new_embeddings = load_file(
|
||||
os.path.join(sql_lora_files, "new_embeddings.safetensors"))
|
||||
os.path.join(sql_lora_files, "new_embeddings.safetensors")
|
||||
)
|
||||
|
||||
peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
sql_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
lora_model = LoRAModel.from_lora_tensors(
|
||||
1,
|
||||
tensors,
|
||||
@@ -54,7 +62,8 @@ def test_from_lora_tensors(sql_lora_files, device):
|
||||
device=device,
|
||||
embeddings=new_embeddings,
|
||||
embedding_modules=EMBEDDING_MODULES,
|
||||
embedding_padding_modules=EMBEDDING_PADDING_MODULES)
|
||||
embedding_padding_modules=EMBEDDING_PADDING_MODULES,
|
||||
)
|
||||
for module_name, lora in lora_model.loras.items():
|
||||
assert lora.module_name == module_name
|
||||
assert lora.rank == 8
|
||||
@@ -63,22 +72,27 @@ def test_from_lora_tensors(sql_lora_files, device):
|
||||
assert lora.lora_b is not None
|
||||
assert lora.lora_a.device == torch.device(device)
|
||||
assert lora.lora_b.device == torch.device(device)
|
||||
assert (lora.lora_a.shape[0] == lora.lora_b.shape[1]
|
||||
), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
|
||||
assert lora.lora_a.shape[0] == lora.lora_b.shape[1], (
|
||||
f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
|
||||
)
|
||||
assert lora.lora_a.shape[0] == 8
|
||||
embeddings_module = next(
|
||||
(k for k in EMBEDDING_MODULES if k in module_name), None)
|
||||
(k for k in EMBEDDING_MODULES if k in module_name), None
|
||||
)
|
||||
if embeddings_module:
|
||||
assert torch.equal(
|
||||
lora.embeddings_tensor,
|
||||
new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
|
||||
device=lora.embeddings_tensor.device))
|
||||
device=lora.embeddings_tensor.device
|
||||
),
|
||||
)
|
||||
else:
|
||||
assert lora.embeddings_tensor is None
|
||||
|
||||
|
||||
def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
|
||||
device: torch.device) -> LoRAModel:
|
||||
def create_lora(
|
||||
lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device
|
||||
) -> LoRAModel:
|
||||
loras: dict[str, LoRALayerWeights] = {}
|
||||
for name in sub_modules:
|
||||
w = model.get_submodule(name).weight
|
||||
@@ -110,8 +124,7 @@ def create_packed_lora(
|
||||
8,
|
||||
16,
|
||||
torch.rand([8, w.shape[1]], device=device),
|
||||
torch.rand([w.shape[0] // len(replaced_module_names), 8],
|
||||
device=device),
|
||||
torch.rand([w.shape[0] // len(replaced_module_names), 8], device=device),
|
||||
)
|
||||
return LoRAModel(lora_id, 8, loras)
|
||||
|
||||
@@ -119,42 +132,42 @@ def create_packed_lora(
|
||||
def test_replace_submodules(dist_init, dummy_model):
|
||||
model = dummy_model
|
||||
manager = LoRAModelManager(
|
||||
model, 1, 1, 1,
|
||||
LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=8,
|
||||
max_loras=8,
|
||||
lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0]))
|
||||
model,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=8, max_loras=8, lora_dtype=DEFAULT_DTYPE
|
||||
),
|
||||
torch.device(DEVICES[0]),
|
||||
)
|
||||
model = manager.model
|
||||
assert isinstance(model.get_submodule("dense1"),
|
||||
ColumnParallelLinearWithLoRA)
|
||||
assert isinstance(model.get_submodule("layer1.dense1"),
|
||||
ColumnParallelLinearWithLoRA)
|
||||
assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA)
|
||||
assert isinstance(
|
||||
model.get_submodule("layer1.dense1"), ColumnParallelLinearWithLoRA
|
||||
)
|
||||
assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
|
||||
assert isinstance(model.get_submodule("layer1.dense2"),
|
||||
RowParallelLinearWithLoRA)
|
||||
assert isinstance(model.get_submodule("layer1.dense2"), RowParallelLinearWithLoRA)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_lora_model_manager(dist_init, dummy_model, device):
|
||||
model = dummy_model
|
||||
model_lora1 = create_lora(1,
|
||||
model, ["layer1.dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora2 = create_lora(2,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora3 = create_lora(3,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
manager = LoRAModelManager(model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=3,
|
||||
max_loras=2,
|
||||
lora_dtype=DEFAULT_DTYPE),
|
||||
device=device)
|
||||
model_lora1 = create_lora(
|
||||
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
|
||||
)
|
||||
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
manager = LoRAModelManager(
|
||||
model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
|
||||
),
|
||||
device=device,
|
||||
)
|
||||
assert all(x is None for x in manager.lora_index_to_id)
|
||||
assert manager.add_adapter(model_lora1)
|
||||
assert manager.activate_adapter(1)
|
||||
@@ -204,24 +217,21 @@ def test_lora_model_manager(dist_init, dummy_model, device):
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
|
||||
model = dummy_model
|
||||
model_lora1 = create_lora(1,
|
||||
model, ["layer1.dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora2 = create_lora(2,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora3 = create_lora(3,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
manager = LRUCacheLoRAModelManager(model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=3,
|
||||
max_loras=2,
|
||||
lora_dtype=DEFAULT_DTYPE),
|
||||
device=device)
|
||||
model_lora1 = create_lora(
|
||||
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
|
||||
)
|
||||
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
manager = LRUCacheLoRAModelManager(
|
||||
model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
|
||||
),
|
||||
device=device,
|
||||
)
|
||||
assert all(x is None for x in manager.lora_index_to_id)
|
||||
assert manager.add_adapter(model_lora1)
|
||||
assert manager.activate_adapter(1)
|
||||
@@ -297,27 +307,22 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
|
||||
# This tests just the LRU cache functionality, everything else is
|
||||
# tested in test_lora_model_manager
|
||||
model = dummy_model
|
||||
model_lora1 = create_lora(1,
|
||||
model, ["layer1.dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora2 = create_lora(2,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora3 = create_lora(3,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
model_lora4 = create_lora(4,
|
||||
model, ["dense1", "dense2", "lm_head"],
|
||||
device=device)
|
||||
manager = LRUCacheLoRAModelManager(model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=2,
|
||||
max_loras=2,
|
||||
lora_dtype=DEFAULT_DTYPE),
|
||||
device=device)
|
||||
model_lora1 = create_lora(
|
||||
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
|
||||
)
|
||||
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"], device=device)
|
||||
manager = LRUCacheLoRAModelManager(
|
||||
model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
|
||||
),
|
||||
device=device,
|
||||
)
|
||||
assert all(x is None for x in manager.lora_index_to_id)
|
||||
|
||||
# Add up to capacity
|
||||
@@ -421,12 +426,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
tmp_path):
|
||||
lora_config = LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=4,
|
||||
max_loras=4,
|
||||
lora_dtype=DEFAULT_DTYPE)
|
||||
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
|
||||
lora_config = LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
|
||||
)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
os.makedirs(dummy_lora_files, exist_ok=True)
|
||||
@@ -438,13 +441,13 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
worker_adapter_manager = LRUCacheWorkerLoRAManager(
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
|
||||
)
|
||||
|
||||
worker_adapter_manager.max_num_seqs = 4
|
||||
worker_adapter_manager.max_num_batched_tokens = 2
|
||||
@@ -452,52 +455,64 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
worker_adapter_manager.create_lora_manager(dummy_model)
|
||||
|
||||
mapping = LoRAMapping([], [])
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("3", 3, dummy_lora_files),
|
||||
LoRARequest("4", 4, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("3", 3, dummy_lora_files),
|
||||
LoRARequest("4", 4, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files),
|
||||
LoRARequest("5", 5, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files),
|
||||
LoRARequest("5", 5, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("6", 6, dummy_lora_files),
|
||||
LoRARequest("7", 7, dummy_lora_files),
|
||||
LoRARequest("8", 8, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("6", 6, dummy_lora_files),
|
||||
LoRARequest("7", 7, dummy_lora_files),
|
||||
LoRARequest("8", 8, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7
|
||||
@@ -506,41 +521,40 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
|
||||
# Over capacity
|
||||
with pytest.raises(RuntimeError):
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("10", 10, dummy_lora_files),
|
||||
LoRARequest("11", 11, dummy_lora_files),
|
||||
LoRARequest("12", 12, dummy_lora_files),
|
||||
LoRARequest("13", 13, dummy_lora_files),
|
||||
LoRARequest("14", 14, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("10", 10, dummy_lora_files),
|
||||
LoRARequest("11", 11, dummy_lora_files),
|
||||
LoRARequest("12", 12, dummy_lora_files),
|
||||
LoRARequest("13", 13, dummy_lora_files),
|
||||
LoRARequest("14", 14, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
|
||||
assert worker_adapter_manager.device == device
|
||||
assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
|
||||
device)
|
||||
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
tmp_path):
|
||||
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
|
||||
# Should remove every LoRA not specified in the request.
|
||||
lora_config = LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=4,
|
||||
max_loras=4,
|
||||
lora_dtype=DEFAULT_DTYPE)
|
||||
lora_config = LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
|
||||
EMBEDDING_MODULES,
|
||||
EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager = WorkerLoRAManager(
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
|
||||
)
|
||||
worker_adapter_manager.vocab_size = (
|
||||
dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size)
|
||||
dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size
|
||||
)
|
||||
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
@@ -553,49 +567,61 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
)
|
||||
|
||||
mapping = LoRAMapping([], [])
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("3", 3, dummy_lora_files),
|
||||
LoRARequest("4", 4, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("3", 3, dummy_lora_files),
|
||||
LoRARequest("4", 4, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 3, 4}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files),
|
||||
LoRARequest("5", 5, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("2", 2, dummy_lora_files),
|
||||
LoRARequest("5", 5, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1, 2, 5}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
LoRARequest("1", 1, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {1}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None
|
||||
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("6", 6, dummy_lora_files),
|
||||
LoRARequest("7", 7, dummy_lora_files),
|
||||
LoRARequest("8", 8, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("6", 6, dummy_lora_files),
|
||||
LoRARequest("7", 7, dummy_lora_files),
|
||||
LoRARequest("8", 8, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
assert worker_adapter_manager.list_adapters() == {6, 7, 8}
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
|
||||
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6
|
||||
@@ -603,17 +629,19 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
|
||||
# Over capacity
|
||||
with pytest.raises(RuntimeError):
|
||||
worker_adapter_manager.set_active_adapters([
|
||||
LoRARequest("10", 10, dummy_lora_files),
|
||||
LoRARequest("11", 11, dummy_lora_files),
|
||||
LoRARequest("12", 12, dummy_lora_files),
|
||||
LoRARequest("13", 13, dummy_lora_files),
|
||||
LoRARequest("14", 14, dummy_lora_files)
|
||||
], mapping)
|
||||
worker_adapter_manager.set_active_adapters(
|
||||
[
|
||||
LoRARequest("10", 10, dummy_lora_files),
|
||||
LoRARequest("11", 11, dummy_lora_files),
|
||||
LoRARequest("12", 12, dummy_lora_files),
|
||||
LoRARequest("13", 13, dummy_lora_files),
|
||||
LoRARequest("14", 14, dummy_lora_files),
|
||||
],
|
||||
mapping,
|
||||
)
|
||||
|
||||
assert worker_adapter_manager.device == device
|
||||
assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
|
||||
device)
|
||||
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@@ -624,7 +652,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
|
||||
model,
|
||||
module_name="gate_up_proj",
|
||||
replaced_module_names=["gate_proj", "up_proj"],
|
||||
device=device)
|
||||
device=device,
|
||||
)
|
||||
model_lora1 = create_packed_lora(
|
||||
2,
|
||||
model,
|
||||
@@ -634,19 +663,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
|
||||
empty_replaced_module_name="gate_proj",
|
||||
)
|
||||
|
||||
manager = LoRAModelManager(model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=2,
|
||||
max_loras=2,
|
||||
lora_dtype=DEFAULT_DTYPE),
|
||||
device=device)
|
||||
manager = LoRAModelManager(
|
||||
model,
|
||||
2,
|
||||
2,
|
||||
2,
|
||||
LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
|
||||
),
|
||||
device=device,
|
||||
)
|
||||
model = manager.model
|
||||
|
||||
assert isinstance(model.get_submodule("gate_up_proj"),
|
||||
MergedColumnParallelLinearWithLoRA)
|
||||
assert isinstance(
|
||||
model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA
|
||||
)
|
||||
# Verify packed lora is correct
|
||||
model_lora_clone = model_lora.clone(1)
|
||||
model_lora_clone1 = model_lora1.clone(1)
|
||||
@@ -659,21 +690,27 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
|
||||
packed_lora = model_lora.get_lora("gate_up_proj")
|
||||
assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
|
||||
|
||||
torch.testing.assert_close(packed_lora.lora_a[0],
|
||||
model_lora_clone.get_lora("gate_proj").lora_a)
|
||||
torch.testing.assert_close(packed_lora.lora_b[0],
|
||||
model_lora_clone.get_lora("gate_proj").lora_b)
|
||||
torch.testing.assert_close(packed_lora.lora_a[1],
|
||||
model_lora_clone.get_lora("up_proj").lora_a)
|
||||
torch.testing.assert_close(packed_lora.lora_b[1],
|
||||
model_lora_clone.get_lora("up_proj").lora_b)
|
||||
torch.testing.assert_close(
|
||||
packed_lora.lora_a[0], model_lora_clone.get_lora("gate_proj").lora_a
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
packed_lora.lora_b[0], model_lora_clone.get_lora("gate_proj").lora_b
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
packed_lora.lora_a[1], model_lora_clone.get_lora("up_proj").lora_a
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
packed_lora.lora_b[1], model_lora_clone.get_lora("up_proj").lora_b
|
||||
)
|
||||
|
||||
packed_lora1 = model_lora1.get_lora("gate_up_proj")
|
||||
assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
|
||||
|
||||
assert packed_lora1.lora_a[0] is None
|
||||
assert packed_lora1.lora_b[0] is None
|
||||
torch.testing.assert_close(packed_lora1.lora_a[1],
|
||||
model_lora_clone1.get_lora("up_proj").lora_a)
|
||||
torch.testing.assert_close(packed_lora1.lora_b[1],
|
||||
model_lora_clone1.get_lora("up_proj").lora_b)
|
||||
torch.testing.assert_close(
|
||||
packed_lora1.lora_a[1], model_lora_clone1.get_lora("up_proj").lora_a
|
||||
)
|
||||
torch.testing.assert_close(
|
||||
packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
|
||||
)
|
||||
|
||||
@@ -15,7 +15,8 @@ MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
PROMPT_TEMPLATE = (
|
||||
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
|
||||
"(<image>./</image>)\nWhat is in the image?<|eot_id|>"
|
||||
"<|start_header_id|>assistant<|end_header_id|>\n\n")
|
||||
"<|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
)
|
||||
|
||||
IMAGE_ASSETS = [
|
||||
ImageAsset("stop_sign"),
|
||||
@@ -34,18 +35,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
stop_token_ids=[128001, 128009], # eos_id, eot_id
|
||||
)
|
||||
|
||||
inputs = [{
|
||||
"prompt": PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {
|
||||
"image": asset.pil_image
|
||||
},
|
||||
} for asset in IMAGE_ASSETS]
|
||||
inputs = [
|
||||
{
|
||||
"prompt": PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in IMAGE_ASSETS
|
||||
]
|
||||
|
||||
outputs = llm.generate(
|
||||
inputs,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
@@ -58,7 +59,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm",
|
||||
)
|
||||
def test_minicpmv_lora(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@@ -68,10 +70,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
max_lora_rank=8,
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
"video": 0
|
||||
},
|
||||
limit_mm_per_prompt={"image": 2, "video": 0},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
@@ -82,11 +81,13 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output2[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm",
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
@@ -96,10 +97,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=4,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
"video": 0
|
||||
},
|
||||
limit_mm_per_prompt={"image": 2, "video": 0},
|
||||
trust_remote_code=True,
|
||||
)
|
||||
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
@@ -107,11 +105,13 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm",
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
@@ -122,10 +122,7 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
max_lora_rank=8,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={
|
||||
"image": 1,
|
||||
"video": 0
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1, "video": 0},
|
||||
fully_sharded_loras=True,
|
||||
)
|
||||
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
|
||||
|
||||
@@ -11,15 +11,15 @@ from vllm.platforms import current_platform
|
||||
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
|
||||
prompts: list[str]) -> list[str]:
|
||||
|
||||
def do_sample(
|
||||
llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
|
||||
) -> list[str]:
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -33,8 +33,11 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
|
||||
@pytest.mark.parametrize("tp_size", [4])
|
||||
def test_mixtral_lora(mixtral_lora_files, tp_size):
|
||||
"""Original test, the LoRA model has the common target modules, not all"""
|
||||
if torch.cuda.device_count(
|
||||
) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
|
||||
if (
|
||||
torch.cuda.device_count() < tp_size
|
||||
and tp_size > 1
|
||||
and current_platform.is_cuda_alike()
|
||||
):
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
prompts = [
|
||||
@@ -57,7 +60,11 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
|
||||
"give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501
|
||||
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501
|
||||
]
|
||||
assert do_sample(llm, mixtral_lora_files, lora_id=1,
|
||||
prompts=prompts) == expected_lora_output
|
||||
assert do_sample(llm, mixtral_lora_files, lora_id=2,
|
||||
prompts=prompts) == expected_lora_output
|
||||
assert (
|
||||
do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts)
|
||||
== expected_lora_output
|
||||
)
|
||||
assert (
|
||||
do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts)
|
||||
== expected_lora_output
|
||||
)
|
||||
|
||||
@@ -13,34 +13,27 @@ from vllm.lora.peft_helper import PEFTHelper
|
||||
ERROR_CASES = [
|
||||
(
|
||||
"test_rank",
|
||||
{
|
||||
"r": 1024
|
||||
},
|
||||
{"r": 1024},
|
||||
"is greater than max_lora_rank",
|
||||
),
|
||||
(
|
||||
"test_bias",
|
||||
{
|
||||
"bias": "all"
|
||||
},
|
||||
{"bias": "all"},
|
||||
"Adapter bias cannot be used without bias_enabled",
|
||||
),
|
||||
("test_dora", {
|
||||
"use_dora": True
|
||||
}, "does not yet support DoRA"),
|
||||
("test_dora", {"use_dora": True}, "does not yet support DoRA"),
|
||||
(
|
||||
"test_modules_to_save",
|
||||
{
|
||||
"modules_to_save": ["lm_head"]
|
||||
},
|
||||
{"modules_to_save": ["lm_head"]},
|
||||
"only supports modules_to_save being None",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def test_peft_helper_pass(sql_lora_files, tmp_path):
|
||||
peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
sql_lora_files, max_position_embeddings=4096
|
||||
)
|
||||
lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
|
||||
peft_helper.validate_legal(lora_config)
|
||||
assert peft_helper.r == 8
|
||||
@@ -74,8 +67,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(adapter_config, f)
|
||||
|
||||
peft_helper = PEFTHelper.from_local_dir(test_dir,
|
||||
max_position_embeddings=4096)
|
||||
peft_helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096)
|
||||
peft_helper.validate_legal(lora_config)
|
||||
scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
|
||||
assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
|
||||
@@ -106,4 +98,5 @@ def test_peft_helper_error(
|
||||
# Test loading the adapter
|
||||
with pytest.raises(ValueError, match=expected_error):
|
||||
PEFTHelper.from_local_dir(
|
||||
test_dir, max_position_embeddings=4096).validate_legal(lora_config)
|
||||
test_dir, max_position_embeddings=4096
|
||||
).validate_legal(lora_config)
|
||||
|
||||
@@ -21,11 +21,18 @@ def reset_device(reset_default_device):
|
||||
|
||||
# Utility shrink and expand operations used as reference implementations.
|
||||
def sgmv_shrink_for_nslices(
|
||||
nslices: int, inputs_tensor: torch.Tensor,
|
||||
lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
|
||||
b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
|
||||
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
|
||||
num_tokens: int, scaling: float):
|
||||
nslices: int,
|
||||
inputs_tensor: torch.Tensor,
|
||||
lora_weights_lst: list[torch.Tensor],
|
||||
out_tensor: torch.Tensor,
|
||||
b_seq_start_loc: torch.Tensor,
|
||||
seq_len_tensor: torch.Tensor,
|
||||
prompt_lora_mapping: torch.Tensor,
|
||||
batches: int,
|
||||
max_seq_length: int,
|
||||
num_tokens: int,
|
||||
scaling: float,
|
||||
):
|
||||
"""
|
||||
Wrapper around torch_ops.sgmv_shrink that handles any nslices.
|
||||
"""
|
||||
@@ -44,15 +51,20 @@ def sgmv_shrink_for_nslices(
|
||||
)
|
||||
|
||||
|
||||
def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
|
||||
inputs_tensor: torch.Tensor,
|
||||
lora_weights_lst: list[torch.Tensor],
|
||||
out_tensor: torch.Tensor,
|
||||
b_seq_start_loc: torch.Tensor,
|
||||
seq_len_tensor: torch.Tensor,
|
||||
prompt_lora_mapping: torch.Tensor, batches: int,
|
||||
max_seq_length: int, num_tokens: int,
|
||||
add_inputs: bool) -> None:
|
||||
def sgmv_expand_for_nslices(
|
||||
nslices: int,
|
||||
hidden_size: int,
|
||||
inputs_tensor: torch.Tensor,
|
||||
lora_weights_lst: list[torch.Tensor],
|
||||
out_tensor: torch.Tensor,
|
||||
b_seq_start_loc: torch.Tensor,
|
||||
seq_len_tensor: torch.Tensor,
|
||||
prompt_lora_mapping: torch.Tensor,
|
||||
batches: int,
|
||||
max_seq_length: int,
|
||||
num_tokens: int,
|
||||
add_inputs: bool,
|
||||
) -> None:
|
||||
"""
|
||||
Wrapper around torch_ops.sgmv_expand that handles any nslices.
|
||||
"""
|
||||
@@ -94,10 +106,17 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
|
||||
_dict_lock = Lock()
|
||||
|
||||
|
||||
def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int,
|
||||
dtype: torch.dtype, device: str, seq_length: int,
|
||||
scaling: float):
|
||||
def check_lora_shrink_kernel(
|
||||
batches: int,
|
||||
num_loras: int,
|
||||
rank: int,
|
||||
hidden_size: int,
|
||||
nslices: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
seq_length: int,
|
||||
scaling: float,
|
||||
):
|
||||
"""
|
||||
Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
|
||||
kernels.
|
||||
@@ -116,14 +135,19 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
|
||||
max_seq_length, token_nums = data.meta()
|
||||
|
||||
# Setup metadata information for SGMV and reference kernels
|
||||
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
|
||||
data.prompt_lora_mapping, batches, max_seq_length,
|
||||
token_nums)
|
||||
sgmv_meta_args = (
|
||||
data.b_seq_start_loc,
|
||||
data.seq_len_tensor,
|
||||
data.prompt_lora_mapping,
|
||||
batches,
|
||||
max_seq_length,
|
||||
token_nums,
|
||||
)
|
||||
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
lora_meta = LoRAKernelMeta.make(
|
||||
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
|
||||
)
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
ref_out_tensor = data.ref_out_tensor
|
||||
@@ -154,10 +178,17 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
|
||||
assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int,
|
||||
dtype: torch.dtype, device: str, seq_length: int,
|
||||
add_inputs: bool):
|
||||
def check_lora_expand_kernel(
|
||||
batches: int,
|
||||
num_loras: int,
|
||||
rank: int,
|
||||
hidden_size: int,
|
||||
nslices: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
seq_length: int,
|
||||
add_inputs: bool,
|
||||
):
|
||||
"""
|
||||
Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
|
||||
kernels.
|
||||
@@ -177,14 +208,19 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
|
||||
max_seq_length, token_nums = data.meta()
|
||||
|
||||
# Setup metadata information for SGMV and reference kernels
|
||||
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
|
||||
data.prompt_lora_mapping, batches, max_seq_length,
|
||||
token_nums)
|
||||
sgmv_meta_args = (
|
||||
data.b_seq_start_loc,
|
||||
data.seq_len_tensor,
|
||||
data.prompt_lora_mapping,
|
||||
batches,
|
||||
max_seq_length,
|
||||
token_nums,
|
||||
)
|
||||
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
lora_meta = LoRAKernelMeta.make(
|
||||
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
|
||||
)
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
# Setup output tensors
|
||||
@@ -194,21 +230,25 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
|
||||
with _dict_lock:
|
||||
# lora_expand kernel
|
||||
_LORA_B_PTR_DICT.clear()
|
||||
triton_ops.lora_expand(data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
out_tensor,
|
||||
*lora_meta.meta_args(token_nums=token_nums),
|
||||
offset_start=0,
|
||||
add_inputs=add_inputs)
|
||||
triton_ops.lora_expand(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
out_tensor,
|
||||
*lora_meta.meta_args(token_nums=token_nums),
|
||||
offset_start=0,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
|
||||
# Reference
|
||||
sgmv_expand_for_nslices(nslices,
|
||||
hidden_size,
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
ref_out_tensor,
|
||||
*sgmv_meta_args,
|
||||
add_inputs=add_inputs)
|
||||
sgmv_expand_for_nslices(
|
||||
nslices,
|
||||
hidden_size,
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
ref_out_tensor,
|
||||
*sgmv_meta_args,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
|
||||
assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
@@ -299,7 +339,7 @@ HIDDEN_SIZES = [
|
||||
128000,
|
||||
128256,
|
||||
]
|
||||
#The size of TP
|
||||
# The size of TP
|
||||
divisibility = [1, 2, 8, 16, 64]
|
||||
|
||||
all_hidden_size = []
|
||||
@@ -331,10 +371,10 @@ DEVICES = [f"cuda:{0}"]
|
||||
SEED = [0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("batches", test_params["batches"])
|
||||
@pytest.mark.parametrize("num_loras", test_params["num_loras"])
|
||||
@pytest.mark.parametrize("rank", test_params["max_ranks"])
|
||||
@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
|
||||
@pytest.mark.parametrize("nslices", [1, 2, 3])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@@ -358,31 +398,35 @@ def test_kernels(
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_lora_shrink_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
check_lora_shrink_kernel(
|
||||
batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5,
|
||||
)
|
||||
else:
|
||||
check_lora_expand_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
check_lora_expand_kernel(
|
||||
batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", hs_test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("batches", hs_test_params["batches"])
|
||||
@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
|
||||
@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
|
||||
@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
|
||||
@pytest.mark.parametrize("nslices", [1, 2, 3])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@@ -406,22 +450,26 @@ def test_kernels_hidden_size(
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_lora_shrink_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
check_lora_shrink_kernel(
|
||||
batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5,
|
||||
)
|
||||
else:
|
||||
check_lora_expand_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
check_lora_expand_kernel(
|
||||
batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True,
|
||||
)
|
||||
|
||||
@@ -20,28 +20,27 @@ class ModelWithQuantization:
|
||||
|
||||
|
||||
MODELS: list[ModelWithQuantization]
|
||||
#AWQ quantization is currently not supported in ROCm.
|
||||
# AWQ quantization is currently not supported in ROCm.
|
||||
if current_platform.is_rocm():
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="gptq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
else:
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
|
||||
quantization="awq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
|
||||
),
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="gptq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
max_tokens: int = 256) -> list[str]:
|
||||
def do_sample(
|
||||
llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
|
||||
) -> list[str]:
|
||||
raw_prompts = [
|
||||
"Give me an orange-ish brown color",
|
||||
"Give me a neon pink color",
|
||||
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,
|
||||
|
||||
prompts = [format_prompt_tuples(p) for p in raw_prompts]
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
stop=["<|im_end|>"])
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
|
||||
)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
llm = vllm.LLM(
|
||||
model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_model_len=400,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
tokenizer=tinyllama_lora_files)
|
||||
tokenizer=tinyllama_lora_files,
|
||||
)
|
||||
|
||||
if model.quantization is None:
|
||||
expected_lora_output = [
|
||||
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
def expect_match(output, expected_output):
|
||||
# HACK: GPTQ lora outputs are just incredibly unstable.
|
||||
# Assert that the outputs changed.
|
||||
if (model.quantization == "gptq"
|
||||
and expected_output is expected_lora_output):
|
||||
if model.quantization == "gptq" and expected_output is expected_lora_output:
|
||||
for i, o in enumerate(output):
|
||||
assert o.startswith(
|
||||
'#'), f"Expected example {i} to start with # but got {o}"
|
||||
assert o.startswith("#"), (
|
||||
f"Expected example {i} to start with # but got {o}"
|
||||
)
|
||||
return
|
||||
assert output == expected_output
|
||||
|
||||
@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=1,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("lora 2")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=2,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("removing lora")
|
||||
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
model):
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
|
||||
if num_gpus_available < 2:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
|
||||
if model.quantization == "gptq":
|
||||
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
enable_chunked_prefill=True)
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp2
|
||||
|
||||
@@ -37,7 +37,8 @@ class Qwen2VLTester:
|
||||
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
|
||||
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
|
||||
"What is in the image?<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
def __init__(self, config: TestConfig):
|
||||
self.config = config
|
||||
@@ -56,68 +57,68 @@ class Qwen2VLTester:
|
||||
max_model_len=self.config.max_model_len,
|
||||
)
|
||||
|
||||
def run_test(self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[str],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
max_tokens: int = 5):
|
||||
|
||||
def run_test(
|
||||
self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[str],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
max_tokens: int = 5,
|
||||
):
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
inputs = [{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {
|
||||
"image": asset.pil_image
|
||||
},
|
||||
} for asset in images]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id,
|
||||
self.config.lora_path)
|
||||
outputs = self.llm.generate(inputs,
|
||||
sampling_params,
|
||||
lora_request=lora_request)
|
||||
generated_texts = [
|
||||
output.outputs[0].text.strip() for output in outputs
|
||||
inputs = [
|
||||
{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in images
|
||||
]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
|
||||
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
|
||||
generated_texts = [output.outputs[0].text.strip() for output in outputs]
|
||||
|
||||
# Validate outputs
|
||||
for generated, expected in zip(generated_texts, expected_outputs):
|
||||
assert expected.startswith(
|
||||
generated), f"Generated text {generated} doesn't "
|
||||
assert expected.startswith(generated), (
|
||||
f"Generated text {generated} doesn't "
|
||||
)
|
||||
f"match expected pattern {expected}"
|
||||
|
||||
def run_beam_search_test(self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[list[str]],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
beam_width: int = 2,
|
||||
max_tokens: int = 5):
|
||||
def run_beam_search_test(
|
||||
self,
|
||||
images: list[ImageAsset],
|
||||
expected_outputs: list[list[str]],
|
||||
lora_id: Optional[int] = None,
|
||||
temperature: float = 0,
|
||||
beam_width: int = 2,
|
||||
max_tokens: int = 5,
|
||||
):
|
||||
beam_search_params = BeamSearchParams(
|
||||
beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
|
||||
)
|
||||
|
||||
beam_search_params = BeamSearchParams(beam_width=beam_width,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature)
|
||||
inputs = [
|
||||
{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {"image": asset.pil_image},
|
||||
}
|
||||
for asset in images
|
||||
]
|
||||
|
||||
inputs = [{
|
||||
"prompt": self.PROMPT_TEMPLATE,
|
||||
"multi_modal_data": {
|
||||
"image": asset.pil_image
|
||||
},
|
||||
} for asset in images]
|
||||
|
||||
lora_request = LoRARequest(str(lora_id), lora_id,
|
||||
self.config.lora_path)
|
||||
outputs = self.llm.beam_search(inputs,
|
||||
beam_search_params,
|
||||
lora_request=lora_request)
|
||||
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
|
||||
outputs = self.llm.beam_search(
|
||||
inputs, beam_search_params, lora_request=lora_request
|
||||
)
|
||||
|
||||
for output_obj, expected_outs in zip(outputs, expected_outputs):
|
||||
output_texts = [seq.text for seq in output_obj.sequences]
|
||||
assert output_texts == expected_outs, \
|
||||
f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501
|
||||
assert output_texts == expected_outs, (
|
||||
f"Generated texts {output_texts} do not match expected {expected_outs}"
|
||||
) # noqa: E501
|
||||
|
||||
|
||||
TEST_IMAGES = [
|
||||
@@ -144,27 +145,25 @@ QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="Qwen2-VL dependency xformers incompatible with ROCm")
|
||||
reason="Qwen2-VL dependency xformers incompatible with ROCm",
|
||||
)
|
||||
def test_qwen2vl_lora(qwen2vl_lora_files):
|
||||
"""Test Qwen 2.0 VL model with LoRA"""
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
|
||||
lora_path=qwen2vl_lora_files)
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
tester.run_test(TEST_IMAGES,
|
||||
expected_outputs=EXPECTED_OUTPUTS,
|
||||
lora_id=lora_id)
|
||||
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="Qwen2-VL dependency xformers incompatible with ROCm")
|
||||
reason="Qwen2-VL dependency xformers incompatible with ROCm",
|
||||
)
|
||||
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
"""Test Qwen 2.0 VL model with LoRA through beam search."""
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
|
||||
lora_path=qwen2vl_lora_files)
|
||||
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
@@ -176,7 +175,8 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
tester.run_beam_search_test(
|
||||
[ImageAsset("cherry_blossom")],
|
||||
expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
|
||||
lora_id=lora_id)
|
||||
lora_id=lora_id,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
@@ -185,12 +185,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
|
||||
)
|
||||
def test_qwen25vl_lora(qwen25vl_lora_files):
|
||||
"""Test Qwen 2.5 VL model with LoRA"""
|
||||
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
|
||||
lora_path=qwen25vl_lora_files)
|
||||
config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
|
||||
tester = Qwen2VLTester(config)
|
||||
|
||||
# Test with different LoRA IDs
|
||||
for lora_id in [1, 2]:
|
||||
tester.run_test(TEST_IMAGES,
|
||||
expected_outputs=EXPECTED_OUTPUTS,
|
||||
lora_id=lora_id)
|
||||
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
|
||||
|
||||
@@ -12,13 +12,15 @@ from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
class DummyLoRAResolver(LoRAResolver):
|
||||
"""A dummy LoRA resolver for testing."""
|
||||
|
||||
async def resolve_lora(self, base_model_name: str,
|
||||
lora_name: str) -> Optional[LoRARequest]:
|
||||
async def resolve_lora(
|
||||
self, base_model_name: str, lora_name: str
|
||||
) -> Optional[LoRARequest]:
|
||||
if lora_name == "test_lora":
|
||||
return LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
|
||||
lora_int_id=abs(hash(lora_name)))
|
||||
lora_int_id=abs(hash(lora_name)),
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@@ -70,6 +72,5 @@ async def test_dummy_resolver_resolve():
|
||||
assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"
|
||||
|
||||
# Test failed resolution
|
||||
result = await dummy_resolver.resolve_lora(base_model_name,
|
||||
"nonexistent_lora")
|
||||
result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora")
|
||||
assert result is None
|
||||
|
||||
@@ -24,20 +24,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
query=
|
||||
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
|
||||
query="What are all distinct countries where singers above age 20 are from?" # noqa: E501
|
||||
),
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -49,13 +47,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
|
||||
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
@@ -65,20 +65,23 @@ def test_ilama_lora(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=False,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
@@ -88,20 +91,23 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output1 = do_sample(llm, ilama_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
@@ -9,8 +9,11 @@ import pytest
|
||||
from huggingface_hub.utils import HfHubHTTPError
|
||||
from torch import nn
|
||||
|
||||
from vllm.lora.utils import (get_adapter_absolute_path,
|
||||
parse_fine_tuned_lora_name, replace_submodule)
|
||||
from vllm.lora.utils import (
|
||||
get_adapter_absolute_path,
|
||||
parse_fine_tuned_lora_name,
|
||||
replace_submodule,
|
||||
)
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
|
||||
|
||||
@@ -24,10 +27,12 @@ class LoRANameParserTestConfig(NamedTuple):
|
||||
|
||||
def test_parse_fine_tuned_lora_name_valid():
|
||||
fixture = [
|
||||
LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight",
|
||||
"lm_head", True, False),
|
||||
LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight",
|
||||
"lm_head", False, False),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.lm_head.lora_A.weight", "lm_head", True, False
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.lm_head.lora_B.weight", "lm_head", False, False
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.embed_tokens.lora_embedding_A",
|
||||
"model.embed_tokens",
|
||||
@@ -71,7 +76,8 @@ def test_parse_fine_tuned_lora_name_valid():
|
||||
True,
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}),
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
@@ -79,7 +85,8 @@ def test_parse_fine_tuned_lora_name_valid():
|
||||
False,
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}),
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"model.layers.9.mlp.down_proj.lora_A.weight",
|
||||
@@ -87,7 +94,8 @@ def test_parse_fine_tuned_lora_name_valid():
|
||||
True,
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}),
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
LoRANameParserTestConfig(
|
||||
"model.layers.9.mlp.down_proj.lora_B.weight",
|
||||
@@ -95,12 +103,14 @@ def test_parse_fine_tuned_lora_name_valid():
|
||||
False,
|
||||
False,
|
||||
weights_mapper=WeightsMapper(
|
||||
orig_to_new_prefix={"model.": "language_model.model."}),
|
||||
orig_to_new_prefix={"model.": "language_model.model."}
|
||||
),
|
||||
),
|
||||
]
|
||||
for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
|
||||
assert (module_name, is_lora_a,
|
||||
is_bias) == parse_fine_tuned_lora_name(name, weights_mapper)
|
||||
assert (module_name, is_lora_a, is_bias) == parse_fine_tuned_lora_name(
|
||||
name, weights_mapper
|
||||
)
|
||||
|
||||
|
||||
def test_parse_fine_tuned_lora_name_invalid():
|
||||
@@ -115,22 +125,28 @@ def test_parse_fine_tuned_lora_name_invalid():
|
||||
|
||||
def test_replace_submodule():
|
||||
model = nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", nn.Linear(764, 100)),
|
||||
("act1", nn.ReLU()),
|
||||
("dense2", nn.Linear(100, 50)),
|
||||
(
|
||||
"seq1",
|
||||
nn.Sequential(
|
||||
OrderedDict([
|
||||
("dense1", nn.Linear(100, 10)),
|
||||
("dense2", nn.Linear(10, 50)),
|
||||
])),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", nn.Linear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
]))
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", nn.Linear(764, 100)),
|
||||
("act1", nn.ReLU()),
|
||||
("dense2", nn.Linear(100, 50)),
|
||||
(
|
||||
"seq1",
|
||||
nn.Sequential(
|
||||
OrderedDict(
|
||||
[
|
||||
("dense1", nn.Linear(100, 10)),
|
||||
("dense2", nn.Linear(10, 50)),
|
||||
]
|
||||
)
|
||||
),
|
||||
),
|
||||
("act2", nn.ReLU()),
|
||||
("output", nn.Linear(50, 10)),
|
||||
("outact", nn.Sigmoid()),
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
sigmoid = nn.Sigmoid()
|
||||
|
||||
@@ -143,52 +159,51 @@ def test_replace_submodule():
|
||||
|
||||
|
||||
# Unit tests for get_adapter_absolute_path
|
||||
@patch('os.path.isabs')
|
||||
@patch("os.path.isabs")
|
||||
def test_get_adapter_absolute_path_absolute(mock_isabs):
|
||||
path = '/absolute/path/to/lora'
|
||||
path = "/absolute/path/to/lora"
|
||||
mock_isabs.return_value = True
|
||||
assert get_adapter_absolute_path(path) == path
|
||||
|
||||
|
||||
@patch('os.path.expanduser')
|
||||
@patch("os.path.expanduser")
|
||||
def test_get_adapter_absolute_path_expanduser(mock_expanduser):
|
||||
# Path with ~ that needs to be expanded
|
||||
path = '~/relative/path/to/lora'
|
||||
absolute_path = '/home/user/relative/path/to/lora'
|
||||
path = "~/relative/path/to/lora"
|
||||
absolute_path = "/home/user/relative/path/to/lora"
|
||||
mock_expanduser.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
|
||||
@patch('os.path.exists')
|
||||
@patch('os.path.abspath')
|
||||
@patch("os.path.exists")
|
||||
@patch("os.path.abspath")
|
||||
def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
|
||||
# Relative path that exists locally
|
||||
path = 'relative/path/to/lora'
|
||||
absolute_path = '/absolute/path/to/lora'
|
||||
path = "relative/path/to/lora"
|
||||
absolute_path = "/absolute/path/to/lora"
|
||||
mock_exist.return_value = True
|
||||
mock_abspath.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
|
||||
@patch('huggingface_hub.snapshot_download')
|
||||
@patch('os.path.exists')
|
||||
def test_get_adapter_absolute_path_huggingface(mock_exist,
|
||||
mock_snapshot_download):
|
||||
@patch("huggingface_hub.snapshot_download")
|
||||
@patch("os.path.exists")
|
||||
def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
|
||||
# Hugging Face model identifier
|
||||
path = 'org/repo'
|
||||
absolute_path = '/mock/snapshot/path'
|
||||
path = "org/repo"
|
||||
absolute_path = "/mock/snapshot/path"
|
||||
mock_exist.return_value = False
|
||||
mock_snapshot_download.return_value = absolute_path
|
||||
assert get_adapter_absolute_path(path) == absolute_path
|
||||
|
||||
|
||||
@patch('huggingface_hub.snapshot_download')
|
||||
@patch('os.path.exists')
|
||||
def test_get_adapter_absolute_path_huggingface_error(mock_exist,
|
||||
mock_snapshot_download):
|
||||
@patch("huggingface_hub.snapshot_download")
|
||||
@patch("os.path.exists")
|
||||
def test_get_adapter_absolute_path_huggingface_error(
|
||||
mock_exist, mock_snapshot_download
|
||||
):
|
||||
# Hugging Face model identifier with download error
|
||||
path = 'org/repo'
|
||||
path = "org/repo"
|
||||
mock_exist.return_value = False
|
||||
mock_snapshot_download.side_effect = HfHubHTTPError(
|
||||
"failed to query model info")
|
||||
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
|
||||
assert get_adapter_absolute_path(path) == path
|
||||
|
||||
@@ -6,8 +6,14 @@ import random
|
||||
import tempfile
|
||||
from unittest.mock import patch
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig, VllmConfig)
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.models import LoRAMapping
|
||||
@@ -19,12 +25,12 @@ NUM_LORAS = 16
|
||||
|
||||
@patch.dict(os.environ, {"RANK": "0"})
|
||||
def test_worker_apply_lora(sql_lora_files):
|
||||
|
||||
def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
|
||||
lora_mapping = LoRAMapping([], [])
|
||||
|
||||
worker.model_runner.lora_manager.set_active_adapters(
|
||||
lora_requests, lora_mapping)
|
||||
lora_requests, lora_mapping
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
@@ -49,9 +55,9 @@ def test_worker_apply_lora(sql_lora_files):
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
),
|
||||
lora_config=LoRAConfig(max_lora_rank=8,
|
||||
max_cpu_loras=NUM_LORAS,
|
||||
max_loras=NUM_LORAS),
|
||||
lora_config=LoRAConfig(
|
||||
max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
|
||||
),
|
||||
)
|
||||
worker = Worker(
|
||||
vllm_config=vllm_config,
|
||||
@@ -67,23 +73,22 @@ def test_worker_apply_lora(sql_lora_files):
|
||||
assert worker.list_loras() == set()
|
||||
|
||||
lora_requests = [
|
||||
LoRARequest(str(i + 1), i + 1, sql_lora_files)
|
||||
for i in range(NUM_LORAS)
|
||||
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS)
|
||||
]
|
||||
|
||||
set_active_loras(worker, lora_requests)
|
||||
assert worker.list_loras() == {
|
||||
lora_request.lora_int_id
|
||||
for lora_request in lora_requests
|
||||
lora_request.lora_int_id for lora_request in lora_requests
|
||||
}
|
||||
|
||||
for i in range(NUM_LORAS):
|
||||
random.seed(i)
|
||||
iter_lora_requests = random.choices(lora_requests,
|
||||
k=random.randint(1, NUM_LORAS))
|
||||
iter_lora_requests = random.choices(
|
||||
lora_requests, k=random.randint(1, NUM_LORAS)
|
||||
)
|
||||
random.shuffle(iter_lora_requests)
|
||||
iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
|
||||
iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
|
||||
set_active_loras(worker, lora_requests)
|
||||
assert worker.list_loras().issuperset(
|
||||
{lora_request.lora_int_id
|
||||
for lora_request in iter_lora_requests})
|
||||
{lora_request.lora_int_id for lora_request in iter_lora_requests}
|
||||
)
|
||||
|
||||
@@ -13,7 +13,6 @@ from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
|
||||
def __init__(self, device: torch.device = "cuda:0"):
|
||||
super().__init__()
|
||||
self._loras: dict[str, LoRALayerWeights] = {}
|
||||
@@ -36,12 +35,12 @@ class DummyLoRAManager:
|
||||
module_name,
|
||||
rank=rank,
|
||||
lora_alpha=1,
|
||||
lora_a=torch.rand([rank, weight.shape[1]],
|
||||
dtype=weight.dtype,
|
||||
device=self._device),
|
||||
lora_b=torch.rand([weight.shape[0], rank],
|
||||
dtype=weight.dtype,
|
||||
device=self._device),
|
||||
lora_a=torch.rand(
|
||||
[rank, weight.shape[1]], dtype=weight.dtype, device=self._device
|
||||
),
|
||||
lora_b=torch.rand(
|
||||
[weight.shape[0], rank], dtype=weight.dtype, device=self._device
|
||||
),
|
||||
)
|
||||
if generate_embeddings_tensor:
|
||||
lora.embeddings_tensor = torch.rand(
|
||||
@@ -146,27 +145,26 @@ def generate_data(
|
||||
op_type,
|
||||
device,
|
||||
) -> PunicaTensors:
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
|
||||
(batches, )).to(device)
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
|
||||
b_seq_start_loc = torch.cumsum(
|
||||
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
|
||||
dim=0,
|
||||
).to(device)
|
||||
total_tokens = seq_len_tensor.sum()
|
||||
if op_type == "shrink":
|
||||
inputs_tensor = torch.rand((total_tokens, hidden_size),
|
||||
dtype=dtype).to(device)
|
||||
inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
|
||||
lora_weights = torch.rand(
|
||||
(lora_nums, max_rank, hidden_size), # col-major
|
||||
dtype=dtype,
|
||||
).to(device)
|
||||
# shrink op needs atomic_add, so output is initialized to 0
|
||||
ref_out_tensor = torch.zeros((total_tokens, max_rank),
|
||||
dtype=dtype,
|
||||
device=inputs_tensor.device)
|
||||
ref_out_tensor = torch.zeros(
|
||||
(total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
|
||||
)
|
||||
# NOTE shrink kernel using torch.float32 as output type
|
||||
our_out_tensor = torch.zeros((total_tokens, max_rank),
|
||||
dtype=torch.float32).to(device)
|
||||
our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to(
|
||||
device
|
||||
)
|
||||
else:
|
||||
inputs_tensor = torch.rand(
|
||||
(total_tokens, max_rank),
|
||||
@@ -184,15 +182,16 @@ def generate_data(
|
||||
).to(device)
|
||||
# Ensure the same input.
|
||||
our_out_tensor = ref_out_tensor.clone()
|
||||
lora_indices_tensor = torch.randint(0,
|
||||
lora_nums - 1 if lora_nums > 1 else 1,
|
||||
(batches, )).to(device)
|
||||
lora_indices_tensor = torch.randint(
|
||||
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
|
||||
).to(device)
|
||||
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
|
||||
current_offset = 0
|
||||
for b_id in range(batches):
|
||||
lora_index = lora_indices_tensor[b_id]
|
||||
indices[current_offset:current_offset +
|
||||
seq_len_tensor[b_id]].copy_(lora_index)
|
||||
indices[current_offset : current_offset + seq_len_tensor[b_id]].copy_(
|
||||
lora_index
|
||||
)
|
||||
current_offset += seq_len_tensor[b_id].item()
|
||||
|
||||
return PunicaTensors(
|
||||
@@ -217,8 +216,7 @@ def generate_data_for_expand_nslices(
|
||||
nslices,
|
||||
device,
|
||||
) -> PunicaTensors:
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
|
||||
(batches, )).to(device)
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
|
||||
b_seq_start_loc = torch.cumsum(
|
||||
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
|
||||
dim=0,
|
||||
@@ -234,22 +232,25 @@ def generate_data_for_expand_nslices(
|
||||
torch.rand(
|
||||
(lora_nums, hidden_size, max_rank), # col-major
|
||||
dtype=dtype,
|
||||
).to(device))
|
||||
).to(device)
|
||||
)
|
||||
# expand op needs to complete y+=a@lora_b, so output is
|
||||
# initialized randomly
|
||||
ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
|
||||
dtype=dtype).to(device)
|
||||
ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to(
|
||||
device
|
||||
)
|
||||
# Ensure the same input.
|
||||
our_out_tensor = ref_out_tensor.clone()
|
||||
lora_indices_tensor = torch.randint(0,
|
||||
lora_nums - 1 if lora_nums > 1 else 1,
|
||||
(batches, ))
|
||||
lora_indices_tensor = torch.randint(
|
||||
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
|
||||
)
|
||||
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
|
||||
current_offset = 0
|
||||
for b_id in range(batches):
|
||||
lora_index = lora_indices_tensor[b_id]
|
||||
indices[current_offset:current_offset +
|
||||
seq_len_tensor[b_id]] = (lora_index.item())
|
||||
indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
|
||||
lora_index.item()
|
||||
)
|
||||
current_offset += seq_len_tensor[b_id].item()
|
||||
|
||||
lora_indices_tensor = lora_indices_tensor.to(device)
|
||||
@@ -276,8 +277,7 @@ def generate_data_for_nslices(
|
||||
op_type,
|
||||
device,
|
||||
) -> PunicaTensors:
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
|
||||
(batches, )).to(device)
|
||||
seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
|
||||
b_seq_start_loc = torch.cumsum(
|
||||
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
|
||||
dim=0,
|
||||
@@ -286,9 +286,7 @@ def generate_data_for_nslices(
|
||||
|
||||
lora_weights_lst = []
|
||||
if op_type == "shrink":
|
||||
|
||||
inputs_tensor = torch.rand((total_tokens, hidden_size),
|
||||
dtype=dtype).to(device)
|
||||
inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
|
||||
|
||||
for _ in range(nslices):
|
||||
if op_type == "shrink":
|
||||
@@ -296,7 +294,8 @@ def generate_data_for_nslices(
|
||||
torch.rand(
|
||||
(lora_nums, max_rank, hidden_size), # col-major
|
||||
dtype=dtype,
|
||||
).to(device))
|
||||
).to(device)
|
||||
)
|
||||
# NOTE shrink kernel using torch.float32 as output type
|
||||
# shrink op needs atomic_add, so output is initialized to 0
|
||||
our_out_tensor = torch.zeros(
|
||||
@@ -313,23 +312,26 @@ def generate_data_for_nslices(
|
||||
torch.rand(
|
||||
(lora_nums, hidden_size, max_rank), # col-major
|
||||
dtype=dtype,
|
||||
).to(device))
|
||||
).to(device)
|
||||
)
|
||||
# expand op needs to complete y+=a@lora_b, so output is
|
||||
# initialized randomly
|
||||
our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
|
||||
dtype=dtype).to(device)
|
||||
our_out_tensor = torch.rand(
|
||||
(total_tokens, hidden_size * nslices), dtype=dtype
|
||||
).to(device)
|
||||
|
||||
# Ensure the same input.
|
||||
ref_out_tensor = our_out_tensor.clone()
|
||||
lora_indices_tensor = torch.randint(0,
|
||||
lora_nums - 1 if lora_nums > 1 else 1,
|
||||
(batches, ))
|
||||
lora_indices_tensor = torch.randint(
|
||||
0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
|
||||
)
|
||||
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
|
||||
current_offset = 0
|
||||
for b_id in range(batches):
|
||||
lora_index = lora_indices_tensor[b_id]
|
||||
indices[current_offset:current_offset +
|
||||
seq_len_tensor[b_id]] = (lora_index.item())
|
||||
indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
|
||||
lora_index.item()
|
||||
)
|
||||
current_offset += seq_len_tensor[b_id].item()
|
||||
|
||||
lora_indices_tensor = lora_indices_tensor.to(device)
|
||||
@@ -379,24 +381,20 @@ def create_peft_lora(
|
||||
}
|
||||
|
||||
for module_name in target_modules:
|
||||
|
||||
module = model
|
||||
for attr in module_name.split("."):
|
||||
module = getattr(module, attr)
|
||||
|
||||
if hasattr(module, "input_size") and hasattr(module, "output_size"):
|
||||
|
||||
in_features = module.input_size
|
||||
out_features = module.output_size
|
||||
|
||||
elif hasattr(module, "embedding_dim") and hasattr(
|
||||
module, "num_embeddings"):
|
||||
elif hasattr(module, "embedding_dim") and hasattr(module, "num_embeddings"):
|
||||
# ParallelLMHead
|
||||
in_features = module.embedding_dim
|
||||
out_features = module.num_embeddings
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unable to determine dimensions for module {module_name}")
|
||||
raise ValueError(f"Unable to determine dimensions for module {module_name}")
|
||||
|
||||
lora_A = torch.randn(rank, in_features, dtype=lora_dtype)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user