Files
vllm/tests/basic_correctness/test_prefetch_offload.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

34 lines
1006 B
Python
Raw Normal View History

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test prefetch offloading correctness with Llama model."""
from ..utils import compare_two_settings
def test_prefetch_offload_llama():
    """Check prefetch CPU offloading against a baseline on Llama-3.2-1B-Instruct.

    The same dense model is handed to ``compare_two_settings`` with two
    argument sets:

    * prefetch offloading enabled (group_size=8, num_in_group=2,
      prefetch_step=1), restricted to the ``gate_up_proj`` and ``down_proj``
      parameters;
    * an empty argument list, i.e. the baseline without offloading.
    """
    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    # How layers are grouped and how far ahead weights are prefetched.
    prefetch_flags = [
        "--offload-group-size",
        "8",
        "--offload-num-in-group",
        "2",
        "--offload-prefetch-step",
        "1",
    ]
    # Selective offloading: only the MLP weights are offloaded.
    selective_flags = [
        "--offload-params",
        "gate_up_proj",
        "down_proj",
    ]

    offload_args = prefetch_flags + selective_flags
    baseline_args = []  # Baseline: no offloading

    compare_two_settings(model_name, offload_args, baseline_args)