[UX] Suppress gloo log spam (#29250)
Signed-off-by: mgoin <mgoin64@gmail.com>
vllm/distributed/parallel_state.py

@@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
 from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.network_utils import get_distributed_init_method
+from vllm.utils.system_utils import suppress_stdout
 from vllm.utils.torch_utils import (
     direct_register_custom_op,
     supports_custom_op,
@@ -329,7 +330,8 @@ class GroupCoordinator:
         )
         # a group with `gloo` backend, to allow direct coordination between
         # processes through the CPU.
-        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+        with suppress_stdout():
+            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
         if self.rank in ranks:
             self.ranks = ranks
             self.world_size = len(ranks)
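Note: the gloo connection banner (lines like "[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1") is printed by C++ code writing straight to file descriptor 1, so a Python-level contextlib.redirect_stdout cannot intercept it; only an fd-level dup2 redirect like the one in suppress_stdout can. A minimal sketch of the difference (fd_silence is an illustrative stand-in for suppress_stdout, and os.write(1, ...) stands in for the C library):

import contextlib
import io
import os
import sys

@contextlib.contextmanager
def fd_silence():
    # Same fd-level technique as suppress_stdout(): point fd 1 at
    # /dev/null, then restore it from a saved duplicate.
    fd = sys.stdout.fileno()
    saved = os.dup(fd)
    devnull = os.open(os.devnull, os.O_WRONLY)
    try:
        sys.stdout.flush()
        os.dup2(devnull, fd)
        yield
    finally:
        sys.stdout.flush()
        os.dup2(saved, fd)
        os.close(saved)
        os.close(devnull)

# A Python-level redirect does not stop writes that go straight to fd 1:
with contextlib.redirect_stdout(io.StringIO()):
    os.write(1, b"leaks through redirect_stdout\n")  # still reaches the terminal

# The fd-level version does:
with fd_silence():
    os.write(1, b"silenced\n")  # swallowed by /dev/null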
vllm/distributed/utils.py

@@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_tcp_uri
+from vllm.utils.system_utils import suppress_stdout
 from vllm.utils.torch_utils import is_torch_equal_or_newer

 logger = init_logger(__name__)
@@ -427,33 +428,34 @@ def init_gloo_process_group(
     Stateless init ProcessGroup with gloo backend compatible with
     different torch versions.
     """
-    if is_torch_equal_or_newer("2.6"):
-        pg = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-    else:
-        options = ProcessGroup.Options(backend="gloo")
-        pg = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-            options,
-        )
-    from torch.distributed.distributed_c10d import ProcessGroupGloo
+    with suppress_stdout():
+        if is_torch_equal_or_newer("2.6"):
+            pg = ProcessGroup(
+                prefix_store,
+                group_rank,
+                group_size,
+            )
+        else:
+            options = ProcessGroup.Options(backend="gloo")
+            pg = ProcessGroup(
+                prefix_store,
+                group_rank,
+                group_size,
+                options,
+            )
+        from torch.distributed.distributed_c10d import ProcessGroupGloo

     backend_class = ProcessGroupGloo(
         prefix_store, group_rank, group_size, timeout=timeout
     )
     backend_type = ProcessGroup.BackendType.GLOO
     device = torch.device("cpu")
     if is_torch_equal_or_newer("2.6"):
         # _set_default_backend is supported in torch >= 2.6
         pg._set_default_backend(backend_type)
     backend_class._set_sequence_number_for_group()

     pg._register_backend(device, backend_type, backend_class)
     return pg
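Note: both branches build the same ProcessGroup shell; the torch >= 2.6 check gates the newer constructor signature and, further down, _set_default_backend. For reference, a version gate in the spirit of is_torch_equal_or_newer("2.6") can be sketched with packaging (torch_at_least is illustrative; vLLM's real helper lives in vllm.utils.torch_utils):

from packaging.version import Version

import torch

def torch_at_least(target: str) -> bool:
    # Drop any local build tag ("2.6.0+cu124" -> "2.6.0") before comparing.
    return Version(torch.__version__.split("+")[0]) >= Version(target)

print(torch_at_least("2.6"))  # True on torch 2.6 or newer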
vllm/utils/system_utils.py

@@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]:
         os.environ[key] = old


+@contextlib.contextmanager
+def suppress_stdout():
+    """
+    Suppress stdout from C libraries at the file descriptor level.
+
+    Only suppresses stdout, not stderr, to preserve error messages.
+    Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG.
+
+    Example:
+        with suppress_stdout():
+            # C library calls that would normally print to stdout
+            torch.distributed.new_group(ranks, backend="gloo")
+    """
+    # Don't suppress if logging level is DEBUG
+    if envs.VLLM_LOGGING_LEVEL == "DEBUG":
+        yield
+        return
+
+    stdout_fd = sys.stdout.fileno()
+    stdout_dup = os.dup(stdout_fd)
+    devnull_fd = os.open(os.devnull, os.O_WRONLY)
+
+    try:
+        sys.stdout.flush()
+        os.dup2(devnull_fd, stdout_fd)
+        yield
+    finally:
+        sys.stdout.flush()
+        os.dup2(stdout_dup, stdout_fd)
+        os.close(stdout_dup)
+        os.close(devnull_fd)
+
+
 # File path utilities
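Note: because the suppression uses dup2 on file descriptor 1, it is process-wide while active: anything any thread writes to stdout inside the window is dropped, which is presumably why the wrapped regions above are kept narrow. The DEBUG escape hatch preserves the raw gloo output for troubleshooting. A usage sketch (assumes vLLM is installed; the variable must be set before vllm reads it):

import os

os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"  # set before importing vllm

from vllm.utils.system_utils import suppress_stdout

with suppress_stdout():
    print("still visible: DEBUG disables suppression")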