# Signed-off-by: wuchenxin <wuchenxin.wcx@alibaba-inc.com>
# Signed-off-by: ibifrost <47308427+ibifrost@users.noreply.github.com>
# Co-authored-by: Simon Mo <simon.mo@hey.com>
# 285 lines, 10 KiB, Python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for resource management in hf3fs_client.py: constructor failure cleanup
and idempotent close(). Tests use mock to replace real I/O operations
(hf3fs_fuse.io, SharedMemory, os, CUDA).
Requires hf3fs_fuse.io to be installed; skipped otherwise.
"""

from typing import Any
from unittest.mock import MagicMock, patch

import pytest

# Flipped to False below when either the hf3fs_fuse bindings or the vLLM
# client module cannot be imported.
HF3FS_AVAILABLE = True
try:
    from hf3fs_fuse.io import (  # noqa: F401
        deregister_fd,
        extract_mount_point,
        make_ioring,
        make_iovec,
        register_fd,
    )

    from vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.hf3fs_client import (
        Hf3fsClient,
    )
except Exception:
    # Deliberately broad: any import-time failure (not only ImportError) marks
    # the dependency as unavailable, so the tests are skipped instead of
    # erroring during collection.
    HF3FS_AVAILABLE = False

# Skip marker applied to every test that needs the real hf3fs client module.
requires_hf3fs = pytest.mark.skipif(
    not HF3FS_AVAILABLE,
    reason="hf3fs_fuse.io is not available on this machine",
)
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class _FakeShm:
|
||
"""Shared-memory stub matching the multiprocessing.shared_memory.SharedMemory
|
||
interface used by Hf3fsClient:
|
||
|
||
Attributes accessed by the constructor:
|
||
.buf – memoryview / buffer-protocol object consumed by torch.frombuffer
|
||
Methods called during normal lifetime:
|
||
.unlink() – called right after the iovec is set up
|
||
.close() – called in _release_resources()
|
||
"""
|
||
|
||
def __init__(self, size: int = 1024):
|
||
self._data = bytearray(size)
|
||
self.buf = memoryview(self._data)
|
||
self.closed = False
|
||
self.close_call_count = 0
|
||
self.unlink_call_count = 0
|
||
|
||
def close(self):
|
||
self.closed = True
|
||
self.close_call_count += 1
|
||
|
||
def unlink(self):
|
||
self.unlink_call_count += 1
|
||
|
||
|
||
# ===========================================================================
# TestHf3fsClientResourceManagement
# ===========================================================================


@requires_hf3fs
class TestHf3fsClientResourceManagement:
    """Tests for constructor failure cleanup and idempotent close()."""

    # Dotted path of the module under test; used to build patch targets.
    _MOD = "vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.hf3fs_client"

    # ------------------------------------------------------------------
    # Helper: build a minimal Hf3fsClient bypassing all real I/O so that
    # we can fully control its internal state.
    # ------------------------------------------------------------------

    def _make_client(self, tmp_path):
        """Return a fully-mocked Hf3fsClient with controllable internals.

        Args:
            tmp_path: pytest ``tmp_path`` fixture; only used to build the
                file path passed to the constructor (``os.open`` is mocked).

        Returns:
            A ``(client, fake_shm_r, fake_shm_w)`` tuple where the two fakes
            are the ``_FakeShm`` objects installed on the client.
        """
        from contextlib import ExitStack

        fake_shm_r = _FakeShm()
        fake_shm_w = _FakeShm()

        patcher_list: list[Any] = [
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(f"{self._MOD}.extract_mount_point", return_value="/mnt/hf3fs"),
            patch(f"{self._MOD}.make_ioring", return_value=MagicMock()),
            patch(f"{self._MOD}.make_iovec", return_value=MagicMock()),
            patch(
                "multiprocessing.shared_memory.SharedMemory",
                side_effect=[fake_shm_r, fake_shm_w],
            ),
            patch("os.open", return_value=99),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("os.fsync"),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            patch("torch.frombuffer", return_value=MagicMock()),
            patch("torch.empty", return_value=MagicMock()),
        ]

        # ExitStack guarantees that every patch that was successfully started
        # is stopped again, even if a later patch fails to start or the
        # constructor raises, so no global patch (e.g. on os.close) leaks
        # into other tests.
        with ExitStack() as stack:
            for p in patcher_list:
                stack.enter_context(p)
            client = Hf3fsClient(
                path=str(tmp_path / "test.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        # Manually point internal handles to our controllable fakes so that
        # assertions after close() can inspect them directly.
        client.shm_r = fake_shm_r
        client.shm_w = fake_shm_w
        client.file = 99
        return client, fake_shm_r, fake_shm_w

    # ------------------------------------------------------------------
    # close() idempotency
    # ------------------------------------------------------------------

    def test_close_idempotent_and_handles_cleared(self, tmp_path):
        """Multiple close() calls must not raise; deregister_fd called exactly
        once, all handles set to None, shm.close() invoked."""
        client, shm_r, shm_w = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd") as mock_dereg,
            patch("os.close"),
        ):
            client.close()  # first close
            client.close()  # second close — must be no-op
            client.close()  # third close — must be no-op

        assert client._closed is True
        assert mock_dereg.call_count == 1, (
            f"deregister_fd called {mock_dereg.call_count} times; expected 1"
        )
        for attr in ("iov_r", "iov_w", "ior_r", "ior_w", "shm_r", "shm_w", "file"):
            assert getattr(client, attr) is None, f"{attr} should be None after close()"
        assert shm_r.closed is True
        assert shm_w.closed is True

    def test_flush_after_close_is_noop(self, tmp_path):
        """flush() after close() must silently do nothing (no fsync call)."""
        client, _, _ = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd"),
            patch("os.close"),
            patch("os.fsync") as mock_fsync,
        ):
            client.close()
            client.flush()

        mock_fsync.assert_not_called()

    # ------------------------------------------------------------------
    # Constructor failure leaves no leaked resources
    # ------------------------------------------------------------------

    def test_constructor_failure_after_file_open_cleans_file(self, tmp_path):
        """If the constructor raises after os.open(), the fd must be closed."""
        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(
                f"{self._MOD}.extract_mount_point",
                side_effect=RuntimeError("mount point not found"),
            ),
            patch("os.open", return_value=55),
            patch("os.ftruncate"),
            patch("os.close") as mock_os_close,
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="mount point not found"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "fail.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        # 55 is the fake descriptor handed out by the mocked os.open above.
        mock_os_close.assert_called_once_with(55)

    def test_constructor_failure_after_shm_alloc_closes_shm(self, tmp_path):
        """Constructor raises after SharedMemory creation → both shm objects closed."""
        fake_shm_r = _FakeShm()
        fake_shm_w = _FakeShm()

        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(f"{self._MOD}.extract_mount_point", return_value="/mnt/hf3fs"),
            patch(
                "multiprocessing.shared_memory.SharedMemory",
                side_effect=[fake_shm_r, fake_shm_w],
            ),
            patch("os.open", return_value=66),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("torch.frombuffer", return_value=MagicMock()),
            patch("torch.empty", return_value=MagicMock()),
            patch(
                f"{self._MOD}.make_ioring",
                side_effect=RuntimeError("ioring init failed"),
            ),
            patch(f"{self._MOD}.make_iovec", return_value=MagicMock()),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="ioring init failed"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "fail2.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        assert fake_shm_r.closed is True, (
            "shm_r was not closed after constructor failure"
        )
        assert fake_shm_w.closed is True, (
            "shm_w was not closed after constructor failure"
        )

    def test_constructor_failure_does_not_close_unallocated_shm(self, tmp_path):
        """Failure before SharedMemory is created must not raise AttributeError
        or TypeError from cleanup."""
        # SharedMemory is intentionally left unpatched here. The test passes
        # only if the original RuntimeError propagates; any other exception
        # raised by cleanup would fail the pytest.raises check.
        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(
                f"{self._MOD}.extract_mount_point",
                side_effect=RuntimeError("early failure"),
            ),
            patch("os.open", return_value=77),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="early failure"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "early_fail.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

    # ------------------------------------------------------------------
    # _release_resources on already-cleared state must be a no-op
    # ------------------------------------------------------------------

    def test_release_resources_on_empty_state_is_safe(self, tmp_path):
        """_release_resources() on a fully-cleared client must not raise."""
        client, _, _ = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd"),
            patch("os.close"),
        ):
            client.close()  # clears all handles

        # Fresh mocks, so calls made by close() above are not counted.
        with (
            patch(f"{self._MOD}.deregister_fd") as mock_dereg2,
            patch("os.close") as mock_os_close2,
        ):
            client._release_resources()  # must not raise

        mock_dereg2.assert_not_called()
        mock_os_close2.assert_not_called()