[LoRA][2/2]Remove LoRA extra vocab (#28545)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -250,6 +250,16 @@ def olmoe_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def qwen3_lora_files():
|
||||
return snapshot_download(repo_id="charent/self_cognition_Alice")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def llama32_lora_files():
|
||||
return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def reset_default_device():
|
||||
"""
|
||||
|
||||
@@ -136,7 +136,6 @@ def populate_loras(
|
||||
id_to_index: list[int | None],
|
||||
layer: BaseLayerWithLoRA,
|
||||
layer_weights: torch.Tensor,
|
||||
generate_embeddings_tensor: int = 0,
|
||||
repeats: int = 1,
|
||||
) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]:
|
||||
"""This method populates the lora layers with lora weights.
|
||||
@@ -148,8 +147,6 @@ def populate_loras(
|
||||
layer: the LoRAlayer to populate.
|
||||
layer_weights: the PyTorch tensor containing the layer's
|
||||
weights.
|
||||
generate_embeddings_tensor: whether to generate an
|
||||
embeddings tensor for each LoRA.
|
||||
repeats: must only be set for column parallel packed
|
||||
layers. Indicates the number of loras to compose
|
||||
together to create a single lora layer.
|
||||
@@ -171,7 +168,6 @@ def populate_loras(
|
||||
sublora = DummyLoRAManager(layer_weights.device).init_random_lora(
|
||||
module_name=f"fake_{i}",
|
||||
weight=layer_weights,
|
||||
generate_embeddings_tensor=generate_embeddings_tensor,
|
||||
)
|
||||
sublora.lora_b = sublora.lora_b[
|
||||
(sublora_len * i) : (sublora_len * (i + 1)), :
|
||||
@@ -185,7 +181,6 @@ def populate_loras(
|
||||
slot_idx,
|
||||
lora_a=lora.lora_a,
|
||||
lora_b=lora.lora_b,
|
||||
embeddings_tensor=lora.embeddings_tensor,
|
||||
)
|
||||
|
||||
lora_dict[lora_id] = lora
|
||||
@@ -306,7 +301,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_embedding(torch.cat(inputs))
|
||||
@@ -344,7 +338,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_embedding(torch.cat(inputs))
|
||||
@@ -354,149 +347,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
|
||||
torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
# @pytest.mark.skip(
|
||||
# reason="Fails when loras are in any slot other than the first.")
|
||||
@pytest.mark.parametrize("num_loras", [1, 2, 4])
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
|
||||
@pytest.mark.parametrize("stage", STAGES)
|
||||
def test_embeddings_with_new_embeddings(
|
||||
dist_init, num_loras, device, vocab_size, stage
|
||||
) -> None:
|
||||
if current_platform.is_cuda_alike():
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
torch.set_default_device(device)
|
||||
max_loras = 8
|
||||
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
|
||||
assert check_punica_wrapper(punica_wrapper)
|
||||
lora_config = LoRAConfig(
|
||||
max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
|
||||
)
|
||||
|
||||
def create_random_embedding_layer():
|
||||
embedding = VocabParallelEmbedding(vocab_size, 256)
|
||||
embedding_data = torch.rand_like(embedding.weight.data)
|
||||
embedding.weight.data = embedding_data
|
||||
embedding.weight.data[vocab_size:, :] = 0
|
||||
expanded_embedding = VocabParallelEmbedding(
|
||||
vocab_size + lora_config.lora_extra_vocab_size * max_loras,
|
||||
256,
|
||||
org_num_embeddings=vocab_size,
|
||||
)
|
||||
expanded_embedding.weight.data[:vocab_size, :] = embedding_data
|
||||
# We need to deepcopy the embedding as it will be modified
|
||||
# in place
|
||||
lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding))
|
||||
lora_embedding.create_lora_weights(max_loras, lora_config)
|
||||
|
||||
return expanded_embedding, lora_embedding
|
||||
|
||||
for i in range(NUM_RANDOM_SEEDS):
|
||||
set_random_seed(i)
|
||||
|
||||
id_to_index = get_random_id_to_index(num_loras, max_loras)
|
||||
expanded_embedding, lora_embedding = create_random_embedding_layer()
|
||||
lora_dict, _ = populate_loras(
|
||||
id_to_index,
|
||||
layer=lora_embedding,
|
||||
layer_weights=torch.zeros(
|
||||
(256, vocab_size + lora_config.lora_extra_vocab_size)
|
||||
),
|
||||
generate_embeddings_tensor=256,
|
||||
)
|
||||
|
||||
lora_embedding.set_mapping(punica_wrapper)
|
||||
# All embeddings tensors have the same shape.
|
||||
embeddings_tensors = [
|
||||
lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys())
|
||||
]
|
||||
embeddings_tensor_len = embeddings_tensors[0].shape[0]
|
||||
|
||||
# Add empty embeddings_tensors for unoccupied lora slots.
|
||||
for _ in range(max_loras - len(embeddings_tensors)):
|
||||
embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))
|
||||
|
||||
inputs, index_mapping, prompt_mapping = create_random_inputs(
|
||||
active_lora_ids=list(lora_dict.keys()),
|
||||
num_inputs=num_loras * 3,
|
||||
input_size=(200,),
|
||||
input_range=(1, vocab_size),
|
||||
device=device,
|
||||
)
|
||||
lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
|
||||
punica_wrapper.update_metadata(
|
||||
lora_mapping,
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
original_inputs = deepcopy(inputs)
|
||||
|
||||
# Force some of the inputs to be in the extended embeddings range
|
||||
# to guarantee that their behavior is tested.
|
||||
for input_, original_input_, lora_id in zip(
|
||||
inputs, original_inputs, prompt_mapping
|
||||
):
|
||||
embedding_id = lora_id - 1
|
||||
input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len)
|
||||
original_input_[-1] = vocab_size
|
||||
input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1)
|
||||
original_input_[-2] = vocab_size + embeddings_tensor_len - 1
|
||||
|
||||
expanded_embedding.weight[
|
||||
vocab_size : vocab_size + (embeddings_tensor_len * max_loras)
|
||||
] = torch.cat(embeddings_tensors)
|
||||
|
||||
lora_result = lora_embedding(torch.cat(original_inputs))
|
||||
|
||||
expected_results: list[torch.Tensor] = []
|
||||
for input_, original_input_, lora_id in zip(
|
||||
inputs, original_inputs, prompt_mapping
|
||||
):
|
||||
lora = lora_dict[lora_id]
|
||||
result = expanded_embedding(input_)
|
||||
after_a = F.embedding(
|
||||
original_input_,
|
||||
lora.lora_a.T,
|
||||
)
|
||||
result += after_a @ lora.lora_b.T
|
||||
expected_results.append(result)
|
||||
expected_result = torch.cat(expected_results)
|
||||
|
||||
rtol, atol = TOLERANCES[lora_result.dtype]
|
||||
torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
|
||||
|
||||
# Check that resetting the lora weights succeeds
|
||||
|
||||
for slot_idx in range(max_loras):
|
||||
lora_embedding.reset_lora(slot_idx)
|
||||
|
||||
inputs, index_mapping, prompt_mapping = create_random_inputs(
|
||||
active_lora_ids=[0],
|
||||
num_inputs=num_loras * 3,
|
||||
input_size=(200,),
|
||||
input_range=(1, vocab_size),
|
||||
device=device,
|
||||
)
|
||||
original_inputs = deepcopy(inputs)
|
||||
lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
|
||||
punica_wrapper.update_metadata(
|
||||
lora_mapping,
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
lora_result = lora_embedding(torch.cat(original_inputs))
|
||||
expected_result = expanded_embedding(torch.cat(inputs))
|
||||
|
||||
rtol, atol = TOLERANCES[lora_result.dtype]
|
||||
torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("num_loras", [1, 2, 4])
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@@ -518,16 +368,13 @@ def test_lm_head_logits_processor(
|
||||
|
||||
def _pretest():
|
||||
linear = ParallelLMHead(
|
||||
vocab_size + lora_config.lora_extra_vocab_size,
|
||||
1024,
|
||||
vocab_size,
|
||||
num_embeddings=vocab_size,
|
||||
embedding_dim=1024,
|
||||
params_dtype=torch.float16,
|
||||
)
|
||||
linear.weight.data = torch.rand_like(linear.weight.data)
|
||||
linear.weight.data[:, vocab_size:] = 0
|
||||
logits_processor = LogitsProcessor(
|
||||
vocab_size + lora_config.lora_extra_vocab_size, vocab_size
|
||||
)
|
||||
logits_processor = LogitsProcessor(vocab_size)
|
||||
lora_logits_processor = LogitsProcessorWithLoRA(
|
||||
logits_processor, 1024, linear.weight.dtype, linear.weight.device, None
|
||||
)
|
||||
@@ -541,15 +388,12 @@ def test_lm_head_logits_processor(
|
||||
id_to_index = get_random_id_to_index(num_loras, max_loras)
|
||||
linear, logits_processor, lora_logits_processor = _pretest()
|
||||
lora_logits_processor.set_mapping(punica_wrapper)
|
||||
# NOTE: all the generated loras share the same embeddings tensor.
|
||||
|
||||
lora_dict, _ = populate_loras(
|
||||
id_to_index,
|
||||
layer=lora_logits_processor,
|
||||
layer_weights=linear.weight,
|
||||
generate_embeddings_tensor=1024,
|
||||
)
|
||||
embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor
|
||||
embeddings_tensor_len = embeddings_tensor.shape[0]
|
||||
|
||||
inputs, index_mapping, prompt_mapping = create_random_inputs(
|
||||
active_lora_ids=list(lora_dict.keys()),
|
||||
@@ -565,7 +409,6 @@ def test_lm_head_logits_processor(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
input_ = torch.rand(20, 1024)
|
||||
|
||||
@@ -575,23 +418,16 @@ def test_lm_head_logits_processor(
|
||||
|
||||
original_lm_head = deepcopy(linear)
|
||||
|
||||
linear.weight[
|
||||
logits_processor.org_vocab_size : logits_processor.org_vocab_size
|
||||
+ embeddings_tensor_len
|
||||
] = embeddings_tensor
|
||||
|
||||
logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size
|
||||
expected_results: list[torch.Tensor] = []
|
||||
for input_, lora_id in zip(inputs, prompt_mapping):
|
||||
lora = lora_dict[lora_id]
|
||||
result = logits_processor._get_logits(
|
||||
hidden_states=input_, lm_head=linear, embedding_bias=None
|
||||
)
|
||||
result[:, vocab_size + embeddings_tensor_len :] = float("-inf")
|
||||
|
||||
result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
|
||||
expected_results.append(result)
|
||||
expected_result = torch.cat(expected_results)
|
||||
logits_processor.org_vocab_size = vocab_size
|
||||
|
||||
# Check that resetting the lora weights succeeds
|
||||
|
||||
@@ -612,7 +448,6 @@ def test_lm_head_logits_processor(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
vocab_size,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_logits_processor._get_logits(
|
||||
@@ -694,7 +529,6 @@ def test_linear_replicated(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
@@ -726,7 +560,10 @@ def test_linear_replicated(
|
||||
lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
|
||||
|
||||
punica_wrapper.update_metadata(
|
||||
lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size
|
||||
lora_mapping,
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
@@ -817,7 +654,6 @@ def test_linear_parallel(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
@@ -849,7 +685,10 @@ def test_linear_parallel(
|
||||
lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage)
|
||||
|
||||
punica_wrapper.update_metadata(
|
||||
lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size
|
||||
lora_mapping,
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
@@ -963,7 +802,6 @@ def test_column_parallel_packed(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
@@ -1000,7 +838,6 @@ def test_column_parallel_packed(
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
)
|
||||
|
||||
lora_result = lora_linear(torch.cat(inputs))[0]
|
||||
|
||||
@@ -13,17 +13,27 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
|
||||
from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.
|
||||
"
|
||||
##Instruction:
|
||||
candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key.
|
||||
Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key.
|
||||
The People_ID of candidate is the foreign key of People_ID of people.
|
||||
###Input:
|
||||
{context}
|
||||
###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
""" # noqa: E501
|
||||
|
||||
EXPECTED_LORA_OUTPUT = [
|
||||
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
|
||||
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
|
||||
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
|
||||
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
|
||||
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
|
||||
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
|
||||
"SELECT count(*) FROM candidate",
|
||||
"SELECT count(*) FROM candidate",
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
]
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct"
|
||||
|
||||
|
||||
def do_sample(
|
||||
llm: vllm.LLM,
|
||||
@@ -32,18 +42,19 @@ def do_sample(
|
||||
tensorizer_config_dict: dict | None = None,
|
||||
) -> list[str]:
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501
|
||||
PROMPT_TEMPLATE.format(context="How many candidates are there?"),
|
||||
PROMPT_TEMPLATE.format(context="Count the number of candidates."),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Which poll resource provided the most number of candidate information?" # noqa: E501
|
||||
),
|
||||
PROMPT_TEMPLATE.format(
|
||||
context="Return the poll resource associated with the most candidates."
|
||||
),
|
||||
]
|
||||
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
|
||||
temperature=0, max_tokens=64, stop=["<|im_end|>"]
|
||||
)
|
||||
|
||||
if tensorizer_config_dict is not None:
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
@@ -75,13 +86,15 @@ def do_sample(
|
||||
return generated_texts
|
||||
|
||||
|
||||
def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None):
|
||||
def generate_and_test(
|
||||
llm, llama32_lora_files, tensorizer_config_dict: dict | None = None
|
||||
):
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
sql_lora_files,
|
||||
llama32_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=1,
|
||||
)
|
||||
@@ -92,7 +105,7 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None =
|
||||
assert (
|
||||
do_sample(
|
||||
llm,
|
||||
sql_lora_files,
|
||||
llama32_lora_files,
|
||||
tensorizer_config_dict=tensorizer_config_dict,
|
||||
lora_id=2,
|
||||
)
|
||||
@@ -104,51 +117,52 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None =
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("cudagraph_specialize_lora", [True, False])
|
||||
def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool):
|
||||
def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=13,
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
max_loras=4,
|
||||
compilation_config=vllm.config.CompilationConfig(
|
||||
cudagraph_specialize_lora=cudagraph_specialize_lora,
|
||||
),
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
def test_llama_lora_tp4(llama32_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=4,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
tokenizer=sql_lora_files,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_num_seqs=8,
|
||||
max_loras=4,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=4,
|
||||
fully_sharded_loras=True,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
generate_and_test(llm, llama32_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_tp2_serialize_and_deserialize_lora(
|
||||
tmp_path, sql_lora_files, sql_lora_huggingface_id
|
||||
tmp_path,
|
||||
llama32_lora_files,
|
||||
):
|
||||
# Run the tensorizing of the LoRA adapter and the model in a subprocess
|
||||
# to guarantee cleanup
|
||||
@@ -157,7 +171,7 @@ def test_tp2_serialize_and_deserialize_lora(
|
||||
model_name = "model-rank-%03d.tensors"
|
||||
|
||||
model_ref = MODEL_PATH
|
||||
lora_path = sql_lora_huggingface_id
|
||||
lora_path = llama32_lora_files
|
||||
suffix = "test"
|
||||
try:
|
||||
result = subprocess.run(
|
||||
@@ -195,12 +209,12 @@ def test_tp2_serialize_and_deserialize_lora(
|
||||
|
||||
loaded_llm = LLM(
|
||||
model=model_ref,
|
||||
tokenizer=sql_lora_files,
|
||||
load_format="tensorizer",
|
||||
enable_lora=True,
|
||||
enforce_eager=True,
|
||||
model_loader_extra_config=tensorizer_config,
|
||||
max_num_seqs=13,
|
||||
max_num_seqs=7,
|
||||
max_model_len=1024,
|
||||
tensor_parallel_size=2,
|
||||
max_loras=2,
|
||||
)
|
||||
@@ -211,7 +225,7 @@ def test_tp2_serialize_and_deserialize_lora(
|
||||
print("lora 1")
|
||||
assert (
|
||||
do_sample(
|
||||
loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
|
||||
loaded_llm, llama32_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
|
||||
)
|
||||
== EXPECTED_LORA_OUTPUT
|
||||
)
|
||||
|
||||
@@ -13,8 +13,8 @@ from vllm.entrypoints.openai.api_server import (
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
LORA_MODULE_PATH = "charent/self_cognition_Alice"
|
||||
LORA_RANK = 8
|
||||
|
||||
|
||||
|
||||
@@ -48,9 +48,6 @@ DEFAULT_DTYPE = torch.get_default_dtype()
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_from_lora_tensors(sql_lora_files, device):
|
||||
tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
|
||||
new_embeddings = load_file(
|
||||
os.path.join(sql_lora_files, "new_embeddings.safetensors")
|
||||
)
|
||||
|
||||
peft_helper = PEFTHelper.from_local_dir(
|
||||
sql_lora_files, max_position_embeddings=4096
|
||||
@@ -60,7 +57,6 @@ def test_from_lora_tensors(sql_lora_files, device):
|
||||
tensors,
|
||||
peft_helper=peft_helper,
|
||||
device=device,
|
||||
embeddings=new_embeddings,
|
||||
embedding_modules=EMBEDDING_MODULES,
|
||||
embedding_padding_modules=EMBEDDING_PADDING_MODULES,
|
||||
)
|
||||
@@ -76,18 +72,6 @@ def test_from_lora_tensors(sql_lora_files, device):
|
||||
f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
|
||||
)
|
||||
assert lora.lora_a.shape[0] == 8
|
||||
embeddings_module = next(
|
||||
(k for k in EMBEDDING_MODULES if k in module_name), None
|
||||
)
|
||||
if embeddings_module:
|
||||
assert torch.equal(
|
||||
lora.embeddings_tensor,
|
||||
new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
|
||||
device=lora.embeddings_tensor.device
|
||||
),
|
||||
)
|
||||
else:
|
||||
assert lora.embeddings_tensor is None
|
||||
|
||||
|
||||
def create_lora(
|
||||
@@ -552,9 +536,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
|
||||
worker_adapter_manager = WorkerLoRAManager(
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
|
||||
)
|
||||
worker_adapter_manager.vocab_size = (
|
||||
dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size
|
||||
)
|
||||
worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size
|
||||
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
|
||||
@@ -20,11 +20,12 @@ from vllm.lora.models import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.worker.gpu_worker import Worker
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
NUM_LORAS = 16
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"RANK": "0"})
|
||||
def test_worker_apply_lora(sql_lora_files):
|
||||
def test_worker_apply_lora(qwen3_lora_files):
|
||||
def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
|
||||
lora_mapping = LoRAMapping([], [])
|
||||
|
||||
@@ -34,9 +35,10 @@ def test_worker_apply_lora(sql_lora_files):
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=ModelConfig(
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
MODEL_PATH,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_model_len=127,
|
||||
enforce_eager=True,
|
||||
),
|
||||
load_config=LoadConfig(
|
||||
@@ -73,7 +75,7 @@ def test_worker_apply_lora(sql_lora_files):
|
||||
assert worker.list_loras() == set()
|
||||
|
||||
lora_requests = [
|
||||
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS)
|
||||
LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS)
|
||||
]
|
||||
|
||||
set_active_loras(worker, lora_requests)
|
||||
|
||||
@@ -28,7 +28,6 @@ class DummyLoRAManager:
|
||||
module_name: str,
|
||||
weight: torch.Tensor,
|
||||
rank: int = 8,
|
||||
generate_embeddings_tensor: int = 0,
|
||||
):
|
||||
lora = LoRALayerWeights(
|
||||
module_name,
|
||||
@@ -41,13 +40,6 @@ class DummyLoRAManager:
|
||||
[weight.shape[0], rank], dtype=weight.dtype, device=self._device
|
||||
),
|
||||
)
|
||||
if generate_embeddings_tensor:
|
||||
lora.embeddings_tensor = torch.rand(
|
||||
5,
|
||||
generate_embeddings_tensor,
|
||||
dtype=weight.dtype,
|
||||
device=self._device,
|
||||
)
|
||||
self.set_module_lora(module_name, lora)
|
||||
|
||||
return lora
|
||||
|
||||
Reference in New Issue
Block a user