Files
vllm/tests/basic_correctness/test_prefetch_offload.py
Ming Yang 6831650c40 [offloader] v2: Hide weight onloading latency via prefetching (#29941)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 17:20:59 -08:00

34 lines
1006 B
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test prefetch offloading correctness with Llama model."""
from ..utils import compare_two_settings
def test_prefetch_offload_llama():
    """Verify prefetch-based CPU offloading matches a no-offload baseline.

    Runs meta-llama/Llama-3.2-1B-Instruct under two settings and asserts
    identical outputs via ``compare_two_settings``:

    1. Prefetch offloading (group_size=8, num_in_group=2, prefetch_step=1),
       restricted to the MLP projection weights.
    2. Baseline with no offloading.

    This exercises prefetching-based offloading on a dense model.
    """
    # Prefetch offloading configuration, limited to MLP weights only.
    offload_args = [
        "--offload-group-size",
        "8",
        "--offload-num-in-group",
        "2",
        "--offload-prefetch-step",
        "1",
        # Selective offloading: only MLP weights
        "--offload-params",
        "gate_up_proj",
        "down_proj",
    ]
    # Baseline run uses no extra flags, i.e. no offloading at all.
    baseline_args: list[str] = []
    compare_two_settings(
        "meta-llama/Llama-3.2-1B-Instruct",
        offload_args,
        baseline_args,
    )