Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import weakref
|
||||
from unittest.mock import Mock
|
||||
@@ -37,16 +38,21 @@ def test_vllm_gc_ed():
|
||||
|
||||
|
||||
def _fix_prompt_embed_outputs(
|
||||
vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
|
||||
example_prompts: list[str]) -> list[tuple[list[int], str]]:
|
||||
vllm_outputs: list[tuple[list[int], str]],
|
||||
hf_model: HfRunner,
|
||||
example_prompts: list[str],
|
||||
) -> list[tuple[list[int], str]]:
|
||||
fixed_vllm_outputs = []
|
||||
for vllm_output, hf_input, prompt in zip(
|
||||
vllm_outputs, hf_model.get_inputs(example_prompts),
|
||||
example_prompts):
|
||||
vllm_outputs, hf_model.get_inputs(example_prompts), example_prompts
|
||||
):
|
||||
hf_input_ids = hf_input["input_ids"].tolist()[0]
|
||||
fixed_vllm_outputs.append(
|
||||
(hf_input_ids + vllm_output[0][len(hf_input_ids):],
|
||||
prompt + vllm_output[1]))
|
||||
(
|
||||
hf_input_ids + vllm_output[0][len(hf_input_ids) :],
|
||||
prompt + vllm_output[1],
|
||||
)
|
||||
)
|
||||
return fixed_vllm_outputs
|
||||
|
||||
|
||||
@@ -69,8 +75,7 @@ def test_models(
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||
pytest.skip(
|
||||
f"{backend} does not support gemma2 with full context length.")
|
||||
pytest.skip(f"{backend} does not support gemma2 with full context length.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
@@ -78,34 +83,35 @@ def test_models(
|
||||
# 5042 tokens for gemma2
|
||||
# gemma2 has alternating sliding window size of 4096
|
||||
# we need a prompt with more than 4096 tokens to test the sliding window
|
||||
prompt = "The following numbers of the sequence " + ", ".join(
|
||||
str(i) for i in range(1024)) + " are:"
|
||||
prompt = (
|
||||
"The following numbers of the sequence "
|
||||
+ ", ".join(str(i) for i in range(1024))
|
||||
+ " are:"
|
||||
)
|
||||
example_prompts = [prompt]
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
if enable_prompt_embeds:
|
||||
with torch.no_grad():
|
||||
prompt_embeds = hf_model.get_prompt_embeddings(
|
||||
example_prompts)
|
||||
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
enable_prompt_embeds=enable_prompt_embeds,
|
||||
gpu_memory_utilization=0.7,
|
||||
async_scheduling=async_scheduling,
|
||||
distributed_executor_backend=model_executor,
|
||||
model,
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
enable_prompt_embeds=enable_prompt_embeds,
|
||||
gpu_memory_utilization=0.7,
|
||||
async_scheduling=async_scheduling,
|
||||
distributed_executor_backend=model_executor,
|
||||
) as vllm_model:
|
||||
if enable_prompt_embeds:
|
||||
vllm_outputs = vllm_model.generate_greedy(
|
||||
prompt_embeds, max_tokens)
|
||||
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
|
||||
vllm_outputs = _fix_prompt_embed_outputs(
|
||||
vllm_outputs, hf_model, example_prompts)
|
||||
vllm_outputs, hf_model, example_prompts
|
||||
)
|
||||
else:
|
||||
vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -117,21 +123,18 @@ def test_models(
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite, extra_env", [
|
||||
"model, distributed_executor_backend, attention_backend, test_suite, extra_env",
|
||||
[
|
||||
("distilbert/distilgpt2", "ray", "", "L4", {}),
|
||||
("distilbert/distilgpt2", "mp", "", "L4", {}),
|
||||
("distilbert/distilgpt2", "ray", "", "L4", {
|
||||
"VLLM_SLEEP_WHEN_IDLE": "1"
|
||||
}),
|
||||
("distilbert/distilgpt2", "mp", "", "L4", {
|
||||
"VLLM_SLEEP_WHEN_IDLE": "1"
|
||||
}),
|
||||
("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
|
||||
("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
|
||||
("distilbert/distilgpt2", "ray", "", "A100", {}),
|
||||
("distilbert/distilgpt2", "mp", "", "A100", {}),
|
||||
])
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
|
||||
def test_models_distributed(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@@ -149,11 +152,14 @@ def test_models_distributed(
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
with monkeypatch.context() as monkeypatch_context:
|
||||
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
if (
|
||||
model == "meta-llama/Llama-3.2-1B-Instruct"
|
||||
and distributed_executor_backend == "ray"
|
||||
and attention_backend == ""
|
||||
and test_suite == "L4"
|
||||
): # noqa
|
||||
if enable_prompt_embeds:
|
||||
pytest.skip(
|
||||
"enable_prompt_embeds does not work with ray compiled dag."
|
||||
)
|
||||
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
|
||||
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
|
||||
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
|
||||
|
||||
@@ -175,30 +181,26 @@ def test_models_distributed(
|
||||
# will hurt multiprocessing backend with fork method
|
||||
# (the default method).
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_prompt_embeds=enable_prompt_embeds,
|
||||
gpu_memory_utilization=0.7,
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_prompt_embeds=enable_prompt_embeds,
|
||||
gpu_memory_utilization=0.7,
|
||||
) as vllm_model:
|
||||
if enable_prompt_embeds:
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
with torch.no_grad():
|
||||
prompt_embeds = hf_model.get_prompt_embeddings(
|
||||
example_prompts)
|
||||
vllm_outputs = vllm_model.generate_greedy(
|
||||
prompt_embeds, max_tokens)
|
||||
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
|
||||
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
|
||||
vllm_outputs = _fix_prompt_embed_outputs(
|
||||
vllm_outputs, hf_model, example_prompts)
|
||||
hf_outputs = hf_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
vllm_outputs, hf_model, example_prompts
|
||||
)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
else:
|
||||
vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@@ -209,27 +211,23 @@ def test_models_distributed(
|
||||
|
||||
|
||||
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
||||
|
||||
from vllm.envs import VLLM_USE_V1
|
||||
|
||||
if not VLLM_USE_V1:
|
||||
pytest.skip("Skipping V0 test, dump input not supported")
|
||||
|
||||
# Needed to mock an error in the same process
|
||||
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
|
||||
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
|
||||
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
|
||||
v1_test_failed_model_execution(vllm_model)
|
||||
|
||||
|
||||
def v1_test_failed_model_execution(vllm_model):
|
||||
|
||||
engine = vllm_model.llm.llm_engine
|
||||
mocked_execute_model = Mock(
|
||||
side_effect=RuntimeError("Mocked Critical Error"))
|
||||
engine.engine_core.engine_core.model_executor.execute_model =\
|
||||
mocked_execute_model
|
||||
mocked_execute_model = Mock(side_effect=RuntimeError("Mocked Critical Error"))
|
||||
engine.engine_core.engine_core.model_executor.execute_model = mocked_execute_model
|
||||
|
||||
with pytest.raises(RuntimeError) as exc_info:
|
||||
prompts = [
|
||||
|
||||
@@ -5,5 +5,6 @@ from ..utils import compare_two_settings
|
||||
|
||||
|
||||
def test_cpu_offload():
|
||||
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
|
||||
["--cpu-offload-gb", "1"])
|
||||
compare_two_settings(
|
||||
"meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
|
||||
)
|
||||
|
||||
@@ -23,13 +23,13 @@ def test_python_error():
|
||||
tensors = []
|
||||
with allocator.use_memory_pool():
|
||||
# allocate 70% of the total memory
|
||||
x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
||||
x = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
|
||||
tensors.append(x)
|
||||
# release the memory
|
||||
allocator.sleep()
|
||||
|
||||
# allocate more memory than the total memory
|
||||
y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
|
||||
y = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
|
||||
tensors.append(y)
|
||||
with pytest.raises(RuntimeError):
|
||||
# when the allocator is woken up, it should raise an error
|
||||
@@ -41,17 +41,17 @@ def test_python_error():
|
||||
def test_basic_cumem():
|
||||
# some tensors from default memory pool
|
||||
shape = (1024, 1024)
|
||||
x = torch.empty(shape, device='cuda')
|
||||
x = torch.empty(shape, device="cuda")
|
||||
x.zero_()
|
||||
|
||||
# some tensors from custom memory pool
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
with allocator.use_memory_pool():
|
||||
# custom memory pool
|
||||
y = torch.empty(shape, device='cuda')
|
||||
y = torch.empty(shape, device="cuda")
|
||||
y.zero_()
|
||||
y += 1
|
||||
z = torch.empty(shape, device='cuda')
|
||||
z = torch.empty(shape, device="cuda")
|
||||
z.zero_()
|
||||
z += 2
|
||||
|
||||
@@ -74,16 +74,16 @@ def test_basic_cumem():
|
||||
def test_cumem_with_cudagraph():
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
with allocator.use_memory_pool():
|
||||
weight = torch.eye(1024, device='cuda')
|
||||
weight = torch.eye(1024, device="cuda")
|
||||
with allocator.use_memory_pool(tag="discard"):
|
||||
cache = torch.empty(1024, 1024, device='cuda')
|
||||
cache = torch.empty(1024, 1024, device="cuda")
|
||||
|
||||
def model(x):
|
||||
out = x @ weight
|
||||
cache[:out.size(0)].copy_(out)
|
||||
cache[: out.size(0)].copy_(out)
|
||||
return out + 1
|
||||
|
||||
x = torch.empty(128, 1024, device='cuda')
|
||||
x = torch.empty(128, 1024, device="cuda")
|
||||
|
||||
# warmup
|
||||
model(x)
|
||||
@@ -109,7 +109,7 @@ def test_cumem_with_cudagraph():
|
||||
model_graph.replay()
|
||||
|
||||
# cache content is as expected
|
||||
assert torch.allclose(x, cache[:x.size(0)])
|
||||
assert torch.allclose(x, cache[: x.size(0)])
|
||||
|
||||
# output content is as expected
|
||||
assert torch.allclose(y, x + 1)
|
||||
@@ -123,7 +123,8 @@ def test_cumem_with_cudagraph():
|
||||
("meta-llama/Llama-3.2-1B", True),
|
||||
# sleep mode with pytorch checkpoint
|
||||
("facebook/opt-125m", True),
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||
with monkeypatch.context() as m:
|
||||
assert use_v1
|
||||
|
||||
Reference in New Issue
Block a user