[CI] Fail subprocess tests with root-cause error (#23795)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
|
|||||||
sentence-transformers # required for embedding tests
|
sentence-transformers # required for embedding tests
|
||||||
soundfile # required for audio tests
|
soundfile # required for audio tests
|
||||||
jiwer # required for audio tests
|
jiwer # required for audio tests
|
||||||
|
tblib # for pickling test exceptions
|
||||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||||
torch==2.8.0
|
torch==2.8.0
|
||||||
torchaudio==2.8.0
|
torchaudio==2.8.0
|
||||||
|
|||||||
@@ -137,7 +137,7 @@ contourpy==1.3.0
|
|||||||
# via matplotlib
|
# via matplotlib
|
||||||
cramjam==2.9.0
|
cramjam==2.9.0
|
||||||
# via fastparquet
|
# via fastparquet
|
||||||
cupy-cuda12x==13.3.0
|
cupy-cuda12x==13.6.0
|
||||||
# via ray
|
# via ray
|
||||||
cycler==0.12.1
|
cycler==0.12.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
|
|||||||
# via pytablewriter
|
# via pytablewriter
|
||||||
tabulate==0.9.0
|
tabulate==0.9.0
|
||||||
# via sacrebleu
|
# via sacrebleu
|
||||||
|
tblib==3.1.0
|
||||||
|
# via -r requirements/test.in
|
||||||
tcolorpy==0.1.6
|
tcolorpy==0.1.6
|
||||||
# via pytablewriter
|
# via pytablewriter
|
||||||
tenacity==9.0.0
|
tenacity==9.0.0
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import copyreg
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -10,6 +11,30 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
import requests
|
||||||
|
import urllib3.exceptions
|
||||||
|
|
||||||
|
|
||||||
|
def _pickle_new_connection_error(obj):
    """Custom pickler for NewConnectionError to fix tblib compatibility.

    The first exception argument is treated as ``"<conn>: <message>"``;
    everything up to and including the first ``": "`` is dropped so that only
    the message is carried through pickling.

    Args:
        obj: The exception instance being pickled.

    Returns:
        A ``(callable, args)`` reducer pair: the unpickle helper and a
        one-element tuple holding the message text.
    """
    raw_message = obj.args[0] if obj.args else ""
    # args[0] is not guaranteed to be a string; coerce it so the prefix
    # handling below can never raise while an exception is being pickled.
    if raw_message is None:
        full_message = ""
    elif isinstance(raw_message, str):
        full_message = raw_message
    else:
        full_message = str(raw_message)

    # Split off the connection part and keep the actual message. When there
    # is no ": " separator the message is kept unchanged.
    _, separator, remainder = full_message.partition(': ')
    actual_message = remainder if separator else full_message
    return _unpickle_new_connection_error, (actual_message, )


def _unpickle_new_connection_error(message):
    """Custom unpickler for NewConnectionError.

    Args:
        message: The message text produced by the matching pickler.

    Returns:
        A new ``urllib3.exceptions.NewConnectionError`` built with ``None``
        in place of the connection, which is not carried through pickling.
    """
    return urllib3.exceptions.NewConnectionError(None, message)
|
||||||
|
|
||||||
|
|
||||||
|
# Hook the custom reducer into pickle for NewConnectionError (needed for
# tblib compatibility).
copyreg.pickle(
    urllib3.exceptions.NewConnectionError,
    _pickle_new_connection_error,
)
|
||||||
|
|
||||||
|
|
||||||
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
||||||
@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
|
|||||||
uvicorn_process.terminate()
|
uvicorn_process.terminate()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(300)
|
||||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
||||||
def test_api_server(api_server, distributed_executor_backend: str):
|
def test_api_server(api_server, distributed_executor_backend: str):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,5 +1,15 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
# ruff: noqa
|
||||||
|
|
||||||
|
from tblib import pickling_support
|
||||||
|
|
||||||
|
# Install support for pickling exceptions so that we can nicely propagate
|
||||||
|
# failures from tests running in a subprocess.
|
||||||
|
# This should be run before any custom exception subclasses are defined.
|
||||||
|
pickling_support.install()
|
||||||
|
|
||||||
import http.server
|
import http.server
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
|
|||||||
120
tests/utils.py
120
tests/utils.py
@@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import contextlib
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
import importlib
|
import importlib
|
||||||
@@ -13,7 +14,7 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from contextlib import contextmanager, suppress
|
from contextlib import ExitStack, contextmanager, suppress
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Literal, Optional, Union
|
from typing import Any, Callable, Literal, Optional, Union
|
||||||
@@ -800,43 +801,106 @@ _P = ParamSpec("_P")
|
|||||||
|
|
||||||
|
|
||||||
def _exit_code_from_wait_status(status):
    """Convert a raw ``os.waitpid`` status into a conventional exit code."""
    if os.WIFEXITED(status):
        return os.WEXITSTATUS(status)
    if os.WIFSIGNALED(status):
        # Same convention as subprocess: negative signal number.
        return -os.WTERMSIG(status)
    return status


def _serialize_child_failure(exc, tb_string):
    """Build the payload a failed child hands back to the parent process."""
    # Only real ``Exception`` instances are shipped as objects; other
    # ``BaseException`` types are reported through the string form so the
    # parent never re-raises e.g. a KeyboardInterrupt into the test session.
    if isinstance(exc, Exception):
        payload = {'pickled_exception': exc}
        try:
            # Test if it can be pickled (with its traceback).
            cloudpickle.dumps(payload)
        except (Exception, KeyboardInterrupt):
            # Deliberate best effort: fall back to the string-based payload.
            pass
        else:
            return payload
    return {
        'exception_type': type(exc).__name__,
        'exception_msg': str(exc),
        'traceback': tb_string,
    }


def fork_new_process_for_each_test(
        func: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to fork a new process for each test function.

    The test body runs in a forked child. On failure the child writes the
    exception (pickled when possible, otherwise its formatted traceback) to a
    temporary file, and the parent re-raises it so the root cause is reported.

    See https://github.com/vllm-project/vllm/issues/7053 for more details.

    Args:
        func: The test function to run in a forked child process.

    Returns:
        A wrapper with the same signature that returns ``None`` on success.

    Raises:
        Exception: The child's original exception, when it could be pickled.
        AssertionError: When the child failed and only a traceback string
            (or nothing at all) could be recovered.
    """

    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        # Make the process the leader of its own process group
        # to avoid sending SIGTERM to the parent process
        os.setpgrp()
        from _pytest.outcomes import Skipped

        # Create a unique temporary file to store exception info from child
        # process. Use test function name and process ID to avoid collisions.
        with tempfile.NamedTemporaryFile(
                delete=False,
                mode='w+b',
                prefix=f"vllm_test_{func.__name__}_{os.getpid()}_",
                suffix=".exc") as exc_file, ExitStack() as delete_after:
            exc_file_path = exc_file.name
            delete_after.callback(os.remove, exc_file_path)

            pid = os.fork()
            print(f"Fork a new process to run a test {pid}")
            if pid == 0:
                # Parent process responsible for deleting, don't delete
                # in child.
                delete_after.pop_all()
                child_exit_code = 1
                try:
                    func(*args, **kwargs)
                    child_exit_code = 0
                except Skipped as e:
                    # convert Skipped to exit code 0
                    print(str(e))
                    child_exit_code = 0
                except BaseException as e:
                    # BaseException (not just Exception): pytest.fail()
                    # raises a BaseException subclass, and nothing may
                    # propagate out of the forked child.
                    import traceback
                    tb_string = traceback.format_exc()
                    try:
                        payload = _serialize_child_failure(e, tb_string)
                        with open(exc_file_path, 'wb') as f:
                            cloudpickle.dump(payload, f)
                    except Exception:
                        # Fallback: just print the traceback.
                        print(tb_string)
                finally:
                    # The child must never return into the caller's pytest
                    # session, whatever happened above.
                    os._exit(child_exit_code)
            else:
                pgid = os.getpgid(pid)
                _pid, wait_status = os.waitpid(pid, 0)
                # ignore SIGTERM signal itself
                old_signal_handler = signal.signal(signal.SIGTERM,
                                                   signal.SIG_IGN)
                try:
                    # kill all child processes
                    os.killpg(pgid, signal.SIGTERM)
                finally:
                    # restore the signal handler even if killpg fails
                    signal.signal(signal.SIGTERM, old_signal_handler)

                # waitpid returns an encoded status, not the exit code.
                exit_code = _exit_code_from_wait_status(wait_status)
                if exit_code != 0:
                    # Try to read the exception from the child process
                    exc_info = {}
                    if os.path.exists(exc_file_path):
                        with contextlib.suppress(Exception), \
                                open(exc_file_path, 'rb') as f:
                            exc_info = cloudpickle.load(f)

                    original_exception = exc_info.get('pickled_exception')
                    if isinstance(original_exception, Exception):
                        # Re-raise the actual exception object if it was
                        # successfully pickled.
                        raise original_exception

                    if (original_tb := exc_info.get("traceback")) is not None:
                        # Use string-based traceback for fallback case
                        raise AssertionError(
                            f"Test {func.__name__} failed when called with"
                            f" args {args} and kwargs {kwargs}"
                            f" (exit code: {exit_code}):\n{original_tb}"
                        ) from None

                    # Fallback to the original generic error
                    raise AssertionError(
                        f"function {func.__name__} failed when called with"
                        f" args {args} and kwargs {kwargs}"
                        f" (exit code: {exit_code})") from None

    return wrapper
|
||||||
|
|
||||||
|
|||||||
@@ -117,10 +117,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
|
|||||||
self.driver_worker.execute_method)
|
self.driver_worker.execute_method)
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
def shutdown(self) -> None:
|
||||||
logger.info(
|
if logger:
|
||||||
"Shutting down Ray distributed executor. If you see error log "
|
# Somehow logger can be None here.
|
||||||
"from logging.cc regarding SIGTERM received, please ignore because "
|
logger.info(
|
||||||
"this is the expected termination process in Ray.")
|
"Shutting down Ray distributed executor. If you see error log "
|
||||||
|
"from logging.cc regarding SIGTERM received, please ignore "
|
||||||
|
"because this is the expected termination process in Ray.")
|
||||||
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
if hasattr(self, "forward_dag") and self.forward_dag is not None:
|
||||||
self.forward_dag.teardown()
|
self.forward_dag.teardown()
|
||||||
import ray
|
import ray
|
||||||
|
|||||||
Reference in New Issue
Block a user