Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -20,28 +20,27 @@ class ModelWithQuantization:


 MODELS: list[ModelWithQuantization]
-#AWQ quantization is currently not supported in ROCm.
+# AWQ quantization is currently not supported in ROCm.
 if current_platform.is_rocm():
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
    ]
 else:
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-            quantization="awq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
+        ),
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
    ]


-def do_sample(llm: vllm.LLM,
-              lora_path: str,
-              lora_id: int,
-              max_tokens: int = 256) -> list[str]:
+def do_sample(
+    llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
+) -> list[str]:
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,

    prompts = [format_prompt_tuples(p) for p in raw_prompts]

-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=max_tokens,
-                                          stop=["<|im_end|>"])
+    sampling_params = vllm.SamplingParams(
+        temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
+    )
    outputs = llm.generate(
        prompts,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,

 @pytest.mark.parametrize("model", MODELS)
 def test_quant_model_lora(tinyllama_lora_files, model):
-
    llm = vllm.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
        enable_chunked_prefill=True,
-        tokenizer=tinyllama_lora_files)
+        tokenizer=tinyllama_lora_files,
+    )

    if model.quantization is None:
        expected_lora_output = [
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
    def expect_match(output, expected_output):
        # HACK: GPTQ lora outputs are just incredibly unstable.
        # Assert that the outputs changed.
-        if (model.quantization == "gptq"
-                and expected_output is expected_lora_output):
+        if model.quantization == "gptq" and expected_output is expected_lora_output:
            for i, o in enumerate(output):
-                assert o.startswith(
-                    '#'), f"Expected example {i} to start with # but got {o}"
+                assert o.startswith("#"), (
+                    f"Expected example {i} to start with # but got {o}"
+                )
            return
        assert output == expected_output

@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):

    print("lora adapter created")
    print("lora 1")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=1,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("lora 2")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=2,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("removing lora")
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):


 @pytest.mark.parametrize("model", MODELS)
-def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
-                                 model):
+def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
    if num_gpus_available < 2:
        pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
    if model.quantization == "gptq":
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    del llm_tp1
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=2,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2