34 lines
1006 B
Python
34 lines
1006 B
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
"""Test prefetch offloading correctness with Llama model."""
|
||
|
|
|
||
|
|
from ..utils import compare_two_settings
|
||
|
|
|
||
|
|
|
||
|
|
def test_prefetch_offload_llama():
    """Compare prefetch CPU offloading against a baseline on Llama-3.2-1B-Instruct.

    Two server argument sets are passed to ``compare_two_settings``:

    * Prefetch offloading (group_size=8, num_in_group=2, prefetch_step=1),
      limited to the MLP weights ``gate_up_proj`` and ``down_proj``.
    * Baseline with no extra arguments, i.e. no offloading.

    This exercises prefetching-based offloading on a dense model.
    """
    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    # Prefetch offloading configuration, followed by the selective
    # offloading filter (only MLP weights).
    prefetch_offload_args = [
        "--offload-group-size",
        "8",
        "--offload-num-in-group",
        "2",
        "--offload-prefetch-step",
        "1",
        "--offload-params",
        "gate_up_proj",
        "down_proj",
    ]

    # Baseline: no offloading.
    baseline_args = []

    compare_two_settings(model_name, prefetch_offload_args, baseline_args)
|