Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -20,28 +20,27 @@ class ModelWithQuantization:
|
||||
|
||||
|
||||
MODELS: list[ModelWithQuantization]
|
||||
#AWQ quantization is currently not supported in ROCm.
|
||||
# AWQ quantization is currently not supported in ROCm.
|
||||
if current_platform.is_rocm():
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="gptq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
else:
|
||||
MODELS = [
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
|
||||
quantization="awq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
|
||||
),
|
||||
ModelWithQuantization(
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
|
||||
quantization="gptq"),
|
||||
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
max_tokens: int = 256) -> list[str]:
|
||||
def do_sample(
|
||||
llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
|
||||
) -> list[str]:
|
||||
raw_prompts = [
|
||||
"Give me an orange-ish brown color",
|
||||
"Give me a neon pink color",
|
||||
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,
|
||||
|
||||
prompts = [format_prompt_tuples(p) for p in raw_prompts]
|
||||
|
||||
sampling_params = vllm.SamplingParams(temperature=0,
|
||||
max_tokens=max_tokens,
|
||||
stop=["<|im_end|>"])
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
|
||||
)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
|
||||
)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
llm = vllm.LLM(
|
||||
model=model.model_path,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_model_len=400,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
tokenizer=tinyllama_lora_files)
|
||||
tokenizer=tinyllama_lora_files,
|
||||
)
|
||||
|
||||
if model.quantization is None:
|
||||
expected_lora_output = [
|
||||
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
def expect_match(output, expected_output):
|
||||
# HACK: GPTQ lora outputs are just incredibly unstable.
|
||||
# Assert that the outputs changed.
|
||||
if (model.quantization == "gptq"
|
||||
and expected_output is expected_lora_output):
|
||||
if model.quantization == "gptq" and expected_output is expected_lora_output:
|
||||
for i, o in enumerate(output):
|
||||
assert o.startswith(
|
||||
'#'), f"Expected example {i} to start with # but got {o}"
|
||||
assert o.startswith("#"), (
|
||||
f"Expected example {i} to start with # but got {o}"
|
||||
)
|
||||
return
|
||||
assert output == expected_output
|
||||
|
||||
@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
print("lora adapter created")
|
||||
print("lora 1")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=1,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("lora 2")
|
||||
output = do_sample(llm,
|
||||
tinyllama_lora_files,
|
||||
lora_id=2,
|
||||
max_tokens=max_tokens)
|
||||
output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
|
||||
expect_match(output, expected_lora_output)
|
||||
|
||||
print("removing lora")
|
||||
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
model):
|
||||
def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
|
||||
if num_gpus_available < 2:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
|
||||
if model.quantization == "gptq":
|
||||
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp1
|
||||
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
gpu_memory_utilization=0.2, # avoid OOM
|
||||
quantization=model.quantization,
|
||||
enable_chunked_prefill=True)
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
|
||||
|
||||
del llm_tp2
|
||||
|
||||
Reference in New Issue
Block a user