[CI] Add retry with 4x backoff to HTTP fetches for transient failures (#37218)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-03-19 14:00:20 -05:00
committed by GitHub
parent e5d96dc8fc
commit fb8b5e05fc
2 changed files with 225 additions and 13 deletions

View File

@@ -1,15 +1,201 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Mapping, MutableMapping
import asyncio
import functools
import time
from collections.abc import Callable, Coroutine, Mapping, MutableMapping
from pathlib import Path
from typing import Any, ParamSpec, TypeVar
import aiohttp
import requests
from urllib3.util import parse_url
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__)
_P = ParamSpec("_P")
_T = TypeVar("_T")
# Multiplier applied to timeout and sleep on each retry attempt.
# Attempt N uses: base_timeout * (_RETRY_BACKOFF_FACTOR ** N) for the
# per-attempt timeout and sleeps _RETRY_BACKOFF_FACTOR ** N seconds.
_RETRY_BACKOFF_FACTOR = 4
def _is_retryable(exc: Exception) -> bool:
    """Classify *exc* as a transient (retryable) failure or not.

    Transient failures worth another attempt:
      - timeouts raised by the stdlib, asyncio, requests, or aiohttp
      - connection-level problems (refused, reset, DNS) from any stack
      - server-side disconnects and 5xx responses (covers S3 503 SlowDown)

    Everything else -- 4xx client errors (bad URL, auth, not-found) and
    programming errors such as ValueError/TypeError -- is treated as
    permanent and is not retried.
    """
    timeout_types = (
        TimeoutError,
        asyncio.TimeoutError,
        requests.exceptions.Timeout,
        aiohttp.ServerTimeoutError,
    )
    connection_types = (
        ConnectionError,
        aiohttp.ClientConnectionError,
        requests.exceptions.ConnectionError,
        # Server dropped the connection mid-request.
        aiohttp.ServerDisconnectedError,
    )
    if isinstance(exc, timeout_types + connection_types):
        return True
    # requests: raise_for_status() throws HTTPError carrying the response.
    if isinstance(exc, requests.exceptions.HTTPError):
        resp = exc.response
        return resp is not None and resp.status_code >= 500
    # aiohttp: raise_for_status() throws ClientResponseError.
    if isinstance(exc, aiohttp.ClientResponseError):
        return exc.status >= 500
    return False
def _log_retry(
    args: tuple,
    kwargs: dict,
    attempt: int,
    max_retries: int,
    attempt_timeout: float | None,
    exc: Exception,
    backoff: float,
    base_timeout: float | None,
) -> None:
    """Emit a warning describing the failed attempt and the upcoming retry."""
    # Decorated functions are bound methods: args[0] is `self`, the URL is
    # args[1] or, when passed by keyword, kwargs["url"].
    url = kwargs.get("url") if len(args) <= 1 else args[1]
    if base_timeout is None:
        timeout_info = "no timeout"
        next_timeout = ""
    else:
        timeout_info = f"timeout={attempt_timeout:.3f}s"
        upcoming = base_timeout * (_RETRY_BACKOFF_FACTOR ** (attempt + 1))
        next_timeout = f" with timeout={upcoming:.3f}s"
    logger.warning(
        "HTTP fetch failed for %s (attempt %d/%d, %s): %s -- retrying in %.3fs%s",
        url,
        attempt + 1,
        max_retries,
        timeout_info,
        exc,
        backoff,
        next_timeout,
    )
def _sync_retry(
    fn: Callable[_P, _T],
) -> Callable[_P, _T]:
    """Add retry logic with exponential backoff to a sync method.

    The decorated method must accept ``timeout`` as a keyword argument.
    Attempt N runs with ``base_timeout * _RETRY_BACKOFF_FACTOR**N`` so a
    transiently slow host is given progressively more time, and the
    wrapper sleeps ``_RETRY_BACKOFF_FACTOR**N`` seconds between attempts.
    Non-retryable errors (see :func:`_is_retryable`) and the final failed
    attempt propagate unchanged.
    """

    @functools.wraps(fn)
    def wrapper(*args: Any, **kwargs: Any) -> _T:
        base_timeout: float | None = kwargs.get("timeout")
        max_retries = max(envs.VLLM_MEDIA_FETCH_MAX_RETRIES, 1)
        last_attempt = max_retries - 1
        for attempt in range(max_retries):
            # Per-attempt scale: grows the timeout and the sleep together.
            scale = _RETRY_BACKOFF_FACTOR**attempt
            attempt_timeout = (
                None if base_timeout is None else base_timeout * scale
            )
            kwargs["timeout"] = attempt_timeout
            try:
                return fn(*args, **kwargs)
            except Exception as e:
                if not _is_retryable(e) or attempt >= last_attempt:
                    raise
                _log_retry(
                    args,
                    kwargs,
                    attempt,
                    max_retries,
                    attempt_timeout,
                    e,
                    scale,
                    base_timeout,
                )
                time.sleep(scale)
        raise AssertionError("unreachable")

    return wrapper  # type: ignore[return-value]
def _async_retry(
    fn: Callable[_P, Coroutine[Any, Any, _T]],
) -> Callable[_P, Coroutine[Any, Any, _T]]:
    """Add retry logic with exponential backoff to an async method.

    The decorated method must accept ``timeout`` as a keyword argument.
    Attempt N runs with ``base_timeout * _RETRY_BACKOFF_FACTOR**N`` so a
    transiently slow host is given progressively more time, and the
    wrapper awaits ``_RETRY_BACKOFF_FACTOR**N`` seconds between attempts.
    Non-retryable errors (see :func:`_is_retryable`) and the final failed
    attempt propagate unchanged.
    """

    @functools.wraps(fn)
    async def wrapper(*args: Any, **kwargs: Any) -> _T:
        base_timeout: float | None = kwargs.get("timeout")
        max_retries = max(envs.VLLM_MEDIA_FETCH_MAX_RETRIES, 1)
        last_attempt = max_retries - 1
        for attempt in range(max_retries):
            # Per-attempt scale: grows the timeout and the sleep together.
            scale = _RETRY_BACKOFF_FACTOR**attempt
            attempt_timeout = (
                None if base_timeout is None else base_timeout * scale
            )
            kwargs["timeout"] = attempt_timeout
            try:
                return await fn(*args, **kwargs)
            except Exception as e:
                if not _is_retryable(e) or attempt >= last_attempt:
                    raise
                _log_retry(
                    args,
                    kwargs,
                    attempt,
                    max_retries,
                    attempt_timeout,
                    e,
                    scale,
                    base_timeout,
                )
                await asyncio.sleep(scale)
        raise AssertionError("unreachable")

    return wrapper  # type: ignore[return-value]
class HTTPConnection:
"""Helper class to send HTTP requests."""
@@ -89,6 +275,7 @@ class HTTPConnection:
allow_redirects=allow_redirects,
)
@_sync_retry
def get_bytes(
self, url: str, *, timeout: float | None = None, allow_redirects: bool = True
) -> bytes:
@@ -99,6 +286,7 @@ class HTTPConnection:
return r.content
@_async_retry
async def async_get_bytes(
self,
url: str,
@@ -147,6 +335,7 @@ class HTTPConnection:
return await r.json()
@_sync_retry
def download_file(
self,
url: str,
@@ -155,15 +344,22 @@ class HTTPConnection:
timeout: float | None = None,
chunk_size: int = 128,
) -> Path:
with self.get_response(url, timeout=timeout) as r:
r.raise_for_status()
try:
with self.get_response(url, timeout=timeout) as r:
r.raise_for_status()
with save_path.open("wb") as f:
for chunk in r.iter_content(chunk_size):
f.write(chunk)
with save_path.open("wb") as f:
for chunk in r.iter_content(chunk_size):
f.write(chunk)
return save_path
return save_path
except Exception:
# Clean up partial downloads before retrying or propagating
if save_path.exists():
save_path.unlink()
raise
@_async_retry
async def async_download_file(
self,
url: str,
@@ -172,14 +368,23 @@ class HTTPConnection:
timeout: float | None = None,
chunk_size: int = 128,
) -> Path:
async with await self.get_async_response(url, timeout=timeout) as r:
r.raise_for_status()
try:
async with await self.get_async_response(
url,
timeout=timeout,
) as r:
r.raise_for_status()
with save_path.open("wb") as f:
async for chunk in r.content.iter_chunked(chunk_size):
f.write(chunk)
with save_path.open("wb") as f:
async for chunk in r.content.iter_chunked(chunk_size):
f.write(chunk)
return save_path
return save_path
except Exception:
# Clean up partial downloads before retrying or propagating
if save_path.exists():
save_path.unlink()
raise
global_http_connection = HTTPConnection()

View File

@@ -64,6 +64,7 @@ if TYPE_CHECKING:
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MEDIA_FETCH_MAX_RETRIES: int = 3
VLLM_MEDIA_URL_ALLOW_REDIRECTS: bool = True
VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
@@ -773,6 +774,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(
os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")
),
# Maximum number of retries for fetching media (images, audio, video)
# from URLs. Each retry quadruples the timeout. Default is 3.
"VLLM_MEDIA_FETCH_MAX_RETRIES": lambda: int(
os.getenv("VLLM_MEDIA_FETCH_MAX_RETRIES", "3")
),
# Whether to allow HTTP redirects when fetching from media URLs.
# Default to True
"VLLM_MEDIA_URL_ALLOW_REDIRECTS": lambda: bool(
@@ -1768,6 +1774,7 @@ def compile_factors() -> dict[str, object]:
"VLLM_IMAGE_FETCH_TIMEOUT",
"VLLM_VIDEO_FETCH_TIMEOUT",
"VLLM_AUDIO_FETCH_TIMEOUT",
"VLLM_MEDIA_FETCH_MAX_RETRIES",
"VLLM_MEDIA_URL_ALLOW_REDIRECTS",
"VLLM_MEDIA_LOADING_THREAD_COUNT",
"VLLM_MAX_AUDIO_CLIP_FILESIZE_MB",