Files
vllm/tests/v1/kv_connector/unit/test_hf3fs_client.py
ibifrost 96b5004b71 [KVConnector] Support 3FS KVConnector (#37636)
Signed-off-by: wuchenxin <wuchenxin.wcx@alibaba-inc.com>
Signed-off-by: ibifrost <47308427+ibifrost@users.noreply.github.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2026-04-07 15:46:00 +00:00

285 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for resource management in hf3fs_client.py: constructor failure cleanup
and idempotent close(). Tests use mock to replace real I/O operations
(hf3fs_fuse.io, SharedMemory, os, CUDA).
Requires hf3fs_fuse.io to be installed; skipped otherwise.
"""
from contextlib import ExitStack
from typing import Any
from unittest.mock import MagicMock, patch

import pytest
# Optional-dependency guard. Both the hf3fs_fuse.io bindings and the client
# module under test are imported inside one try block; if either import raises
# for any reason (not only ImportError), the whole test class is skipped
# instead of failing at collection time.
HF3FS_AVAILABLE = True
try:
    from hf3fs_fuse.io import (  # noqa: F401
        deregister_fd,
        extract_mount_point,
        make_ioring,
        make_iovec,
        register_fd,
    )

    from vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.hf3fs_client import (
        Hf3fsClient,
    )
except Exception:
    # Deliberately broad: any import-time failure means "not available".
    HF3FS_AVAILABLE = False

# Marker applied to tests that need the real hf3fs_fuse.io package importable.
requires_hf3fs = pytest.mark.skipif(
    not HF3FS_AVAILABLE,
    reason="hf3fs_fuse.io is not available on this machine",
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class _FakeShm:
    """Shared-memory stub matching the multiprocessing.shared_memory.SharedMemory
    interface used by Hf3fsClient.

    Attributes accessed by the constructor:
        .buf       memoryview / buffer-protocol object consumed by
                   torch.frombuffer

    Methods called during normal lifetime:
        .unlink()  called right after the iovec is set up
        .close()   called in _release_resources()

    Besides the SharedMemory-like surface, the stub records how it was used so
    that tests can assert on it:
        .closed             True once close() has been called at least once
        .close_call_count   number of close() calls
        .unlink_call_count  number of unlink() calls
    """

    def __init__(self, size: int = 1024) -> None:
        # Keep a reference to the backing bytearray so the memoryview stays
        # valid for the lifetime of the stub.
        self._data = bytearray(size)
        self.buf = memoryview(self._data)
        self.closed = False
        self.close_call_count = 0
        self.unlink_call_count = 0

    def close(self) -> None:
        """Record a close() call; the backing buffer is left untouched."""
        self.closed = True
        self.close_call_count += 1

    def unlink(self) -> None:
        """Record an unlink() call; nothing is actually removed."""
        self.unlink_call_count += 1
# ===========================================================================
# TestHf3fsClientResourceManagement
# ===========================================================================
@requires_hf3fs
class TestHf3fsClientResourceManagement:
    """Tests for constructor failure cleanup and idempotent close()."""

    # Dotted path of the module under test; used to build patch targets.
    _MOD = "vllm.distributed.kv_transfer.kv_connector.v1.hf3fs.hf3fs_client"

    # ------------------------------------------------------------------
    # Helper: build a minimal Hf3fsClient bypassing all real I/O so that
    # we can fully control its internal state.
    # ------------------------------------------------------------------
    def _make_client(self, tmp_path):
        """Return a fully-mocked Hf3fsClient with controllable internals.

        Args:
            tmp_path: pytest's per-test temporary directory; only used to
                build the path string handed to the constructor.

        Returns:
            A ``(client, fake_shm_r, fake_shm_w)`` tuple where the two fakes
            are the ``_FakeShm`` instances installed on the client.
        """
        fake_shm_r = _FakeShm()
        fake_shm_w = _FakeShm()
        patcher_list: list[Any] = [
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(f"{self._MOD}.extract_mount_point", return_value="/mnt/hf3fs"),
            patch(f"{self._MOD}.make_ioring", return_value=MagicMock()),
            patch(f"{self._MOD}.make_iovec", return_value=MagicMock()),
            patch(
                "multiprocessing.shared_memory.SharedMemory",
                side_effect=[fake_shm_r, fake_shm_w],
            ),
            patch("os.open", return_value=99),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("os.fsync"),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            patch("torch.frombuffer", return_value=MagicMock()),
            patch("torch.empty", return_value=MagicMock()),
        ]
        # ExitStack undoes every patch that was successfully entered, even if
        # a later patch fails to start or the constructor raises. Starting the
        # patchers by hand would leave global patches such as os.open/os.close
        # active for the rest of the test session in that case.
        with ExitStack() as stack:
            for patcher in patcher_list:
                stack.enter_context(patcher)
            client = Hf3fsClient(
                path=str(tmp_path / "test.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        # Manually point internal handles to our controllable fakes so that
        # assertions after close() can inspect them directly.
        client.shm_r = fake_shm_r
        client.shm_w = fake_shm_w
        client.file = 99
        return client, fake_shm_r, fake_shm_w

    # ------------------------------------------------------------------
    # close() idempotency
    # ------------------------------------------------------------------
    def test_close_idempotent_and_handles_cleared(self, tmp_path):
        """Multiple close() calls must not raise; deregister_fd called exactly
        once, all handles set to None, shm.close() invoked."""
        client, shm_r, shm_w = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd") as mock_dereg,
            patch("os.close"),
        ):
            client.close()  # first close
            client.close()  # second close - must be no-op
            client.close()  # third close - must be no-op

        # Mocks keep their call records after the patch has been undone.
        assert client._closed is True
        assert mock_dereg.call_count == 1, (
            f"deregister_fd called {mock_dereg.call_count} times; expected 1"
        )
        for attr in ("iov_r", "iov_w", "ior_r", "ior_w", "shm_r", "shm_w", "file"):
            assert getattr(client, attr) is None, f"{attr} should be None after close()"
        assert shm_r.closed is True
        assert shm_w.closed is True

    def test_flush_after_close_is_noop(self, tmp_path):
        """flush() after close() must silently do nothing (no fsync call)."""
        client, _, _ = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd"),
            patch("os.close"),
            patch("os.fsync") as mock_fsync,
        ):
            client.close()
            client.flush()

        mock_fsync.assert_not_called()

    # ------------------------------------------------------------------
    # Constructor failure leaves no leaked resources
    # ------------------------------------------------------------------
    def test_constructor_failure_after_file_open_cleans_file(self, tmp_path):
        """If the constructor raises after os.open(), the fd must be closed."""
        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(
                f"{self._MOD}.extract_mount_point",
                side_effect=RuntimeError("mount point not found"),
            ),
            patch("os.open", return_value=55),
            patch("os.ftruncate"),
            patch("os.close") as mock_os_close,
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="mount point not found"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "fail.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        # 55 is the fake descriptor handed out by the patched os.open above.
        mock_os_close.assert_called_once_with(55)

    def test_constructor_failure_after_shm_alloc_closes_shm(self, tmp_path):
        """Constructor raises after SharedMemory creation: both shm objects
        must be closed."""
        fake_shm_r = _FakeShm()
        fake_shm_w = _FakeShm()

        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(f"{self._MOD}.extract_mount_point", return_value="/mnt/hf3fs"),
            patch(
                "multiprocessing.shared_memory.SharedMemory",
                side_effect=[fake_shm_r, fake_shm_w],
            ),
            patch("os.open", return_value=66),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("torch.frombuffer", return_value=MagicMock()),
            patch("torch.empty", return_value=MagicMock()),
            patch(
                f"{self._MOD}.make_ioring",
                side_effect=RuntimeError("ioring init failed"),
            ),
            patch(f"{self._MOD}.make_iovec", return_value=MagicMock()),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="ioring init failed"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "fail2.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

        assert fake_shm_r.closed is True, (
            "shm_r was not closed after constructor failure"
        )
        assert fake_shm_w.closed is True, (
            "shm_w was not closed after constructor failure"
        )

    def test_constructor_failure_does_not_close_unallocated_shm(self, tmp_path):
        """Failure before SharedMemory is created must not raise AttributeError
        or TypeError from cleanup."""
        # pytest.raises only accepts the injected RuntimeError; any other
        # exception type escaping the constructor's cleanup fails the test.
        with (
            patch(f"{self._MOD}.HF3FS_AVAILABLE", True),
            patch(f"{self._MOD}.register_fd"),
            patch(f"{self._MOD}.deregister_fd"),
            patch(
                f"{self._MOD}.extract_mount_point",
                side_effect=RuntimeError("early failure"),
            ),
            patch("os.open", return_value=77),
            patch("os.ftruncate"),
            patch("os.close"),
            patch("torch.cuda.Stream", return_value=MagicMock()),
            pytest.raises(RuntimeError, match="early failure"),
        ):
            Hf3fsClient(
                path=str(tmp_path / "early_fail.bin"),
                size=1024,
                bytes_per_page=256,
                entries=4,
            )

    # ------------------------------------------------------------------
    # _release_resources on already-cleared state must be a no-op
    # ------------------------------------------------------------------
    def test_release_resources_on_empty_state_is_safe(self, tmp_path):
        """_release_resources() on a fully-cleared client must not raise."""
        client, _, _ = self._make_client(tmp_path)

        with (
            patch(f"{self._MOD}.deregister_fd"),
            patch("os.close"),
        ):
            client.close()  # clears all handles

        # Fresh mocks so that only calls made by the second release are seen.
        with (
            patch(f"{self._MOD}.deregister_fd") as mock_dereg2,
            patch("os.close") as mock_os_close2,
        ):
            client._release_resources()  # must not raise

        mock_dereg2.assert_not_called()
        mock_os_close2.assert_not_called()