[Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com> Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
This commit is contained in:
@@ -3,7 +3,10 @@
|
||||
|
||||
Run `pytest tests/distributed/test_comm_ops.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def all_reduce_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def all_gather_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def broadcast_tensor_dict_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
def send_recv_tensor_dict_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
def send_recv_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel(tp_size, test_target):
|
||||
multi_process_parallel(tp_size, 1, test_target)
|
||||
def test_multi_process_tensor_parallel(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
test_target: Callable[..., Any],
|
||||
):
|
||||
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
||||
def test_multi_process_pipeline_parallel(pp_size, test_target):
|
||||
multi_process_parallel(1, pp_size, test_target)
|
||||
def test_multi_process_pipeline_parallel(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
pp_size: int,
|
||||
test_target: Callable[..., Any],
|
||||
):
|
||||
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
def test_multi_process_tensor_parallel_pipeline_parallel(
|
||||
tp_size, pp_size, test_target):
|
||||
multi_process_parallel(tp_size, pp_size, test_target)
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
test_target: Callable[..., Any],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import pytest
|
||||
@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tensor_model_parallel_group().device_group
|
||||
def graph_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pp_size,
|
||||
rank,
|
||||
distributed_init_port,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tensor_model_parallel_group().device_group
|
||||
|
||||
# A small all_reduce for warmup.
|
||||
# this is needed because device communicators might be created lazily
|
||||
# (e.g. NCCL). This will ensure that the communicator is initialized
|
||||
# before any communication happens, so that this group can be used for
|
||||
# graph capture immediately.
|
||||
data = torch.zeros(1)
|
||||
data = data.to(device=device)
|
||||
torch.distributed.all_reduce(data, group=group)
|
||||
torch.cuda.synchronize()
|
||||
del data
|
||||
# A small all_reduce for warmup.
|
||||
# this is needed because device communicators might be created lazily
|
||||
# (e.g. NCCL). This will ensure that the communicator is initialized
|
||||
# before any communication happens, so that this group can be used for
|
||||
# graph capture immediately.
|
||||
data = torch.zeros(1)
|
||||
data = data.to(device=device)
|
||||
torch.distributed.all_reduce(data, group=group)
|
||||
torch.cuda.synchronize()
|
||||
del data
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture(device=device) as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1, group=group)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2, group=group)
|
||||
graph.replay()
|
||||
torch.testing.assert_close(out1, inp1)
|
||||
torch.testing.assert_close(out2, inp2)
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture(device=device) as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
# synchronization
|
||||
dist.all_reduce(inp1, group=group)
|
||||
out2 = tensor_model_parallel_all_reduce(inp2)
|
||||
dist.all_reduce(inp2, group=group)
|
||||
graph.replay()
|
||||
torch.testing.assert_close(out1, inp1)
|
||||
torch.testing.assert_close(out2, inp2)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
|
||||
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
def eager_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pp_size,
|
||||
rank,
|
||||
distributed_init_port,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
# and so on
|
||||
# this is used to demonstrate that each group can
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
out = fa.all_reduce(out, registered=False)
|
||||
torch.testing.assert_close(out, inp * (tp_size**num_communication))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
|
||||
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
|
||||
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
|
||||
def test_custom_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pipeline_parallel_size,
|
||||
test_target,
|
||||
):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
|
||||
test_target)
|
||||
|
||||
@@ -7,33 +7,35 @@ import pytest
|
||||
from vllm.distributed.utils import get_pp_indices
|
||||
|
||||
|
||||
def test_custom_layer_partition():
|
||||
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
||||
os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
|
||||
for pp_rank, golden in enumerate(goldens):
|
||||
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
||||
if bak is not None:
|
||||
os.environ["VLLM_PP_LAYER_PARTITION"] = bak
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
# Even partition
|
||||
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Balanced partition
|
||||
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
||||
# Put reminder somewhere
|
||||
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
||||
# Invalid partition strings
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of partitions
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of layers
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||
bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
|
||||
m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
|
||||
for pp_rank, golden in enumerate(goldens):
|
||||
assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
|
||||
if bak is not None:
|
||||
m.setenv("VLLM_PP_LAYER_PARTITION", bak)
|
||||
|
||||
# Even partition
|
||||
_verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Balanced partition
|
||||
_verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
|
||||
# Put reminder somewhere
|
||||
_verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
|
||||
# Invalid partition strings
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of partitions
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
# Wrong number of layers
|
||||
with pytest.raises(ValueError):
|
||||
_verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
|
||||
(5, 3, 1, (2, 4)),
|
||||
(5, 3, 2, (4, 5)),
|
||||
])
|
||||
def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
|
||||
pp_rank: int, indices: tuple[int, int]):
|
||||
def test_uneven_auto_partition(
|
||||
num_hidden_layers: int,
|
||||
pp_size: int,
|
||||
pp_rank: int,
|
||||
indices: tuple[int, int],
|
||||
):
|
||||
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
|
||||
|
||||
@@ -1,11 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
|
||||
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
|
||||
(2, "JackFram/llama-160m"),
|
||||
@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
"FLASHINFER",
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
|
||||
cudagraph_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--pipeline-parallel-size",
|
||||
str(PP_SIZE),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
|
||||
def test_pp_cudagraph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
PP_SIZE: int,
|
||||
MODEL_NAME: str,
|
||||
ATTN_BACKEND: LiteralString,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
cudagraph_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"float16",
|
||||
"--pipeline-parallel-size",
|
||||
str(PP_SIZE),
|
||||
"--distributed-executor-backend",
|
||||
"mp",
|
||||
]
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
|
||||
|
||||
eager_args = cudagraph_args + ["--enforce-eager"]
|
||||
eager_args = cudagraph_args + ["--enforce-eager"]
|
||||
|
||||
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
||||
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
|
||||
|
||||
Reference in New Issue
Block a user