[offloader] v2: Hide weight onloading latency via prefetching (#29941)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
33
tests/basic_correctness/test_prefetch_offload.py
Normal file
33
tests/basic_correctness/test_prefetch_offload.py
Normal file
@@ -0,0 +1,33 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test prefetch offloading correctness with Llama model."""

# compare_two_settings runs the same model under two CLI configurations
# and asserts the generated outputs match.
from ..utils import compare_two_settings

def test_prefetch_offload_llama():
    """Test prefetch CPU offloading with Llama-3.2-1B-Instruct.

    Compares outputs between:
    1. Baseline (no offloading)
    2. Prefetch offloading (group_size=8, num_in_group=2, prefetch_step=1)

    This tests prefetching-based offloading on a dense model.
    """
    # First run: prefetch offloading enabled, restricted to the MLP
    # projection weights (gate_up_proj / down_proj).
    prefetch_cli_args = [
        "--offload-group-size",
        "8",
        "--offload-num-in-group",
        "2",
        "--offload-prefetch-step",
        "1",
        # Selective offloading: only MLP weights
        "--offload-params",
        "gate_up_proj",
        "down_proj",
    ]

    # Second run uses an empty arg list -> baseline without offloading;
    # the helper asserts both runs produce identical outputs.
    compare_two_settings(
        "meta-llama/Llama-3.2-1B-Instruct",
        prefetch_cli_args,
        [],
    )
|
||||
Reference in New Issue
Block a user