Files
vllm/tests/basic_correctness/test_prefetch_offload.py
Ming Yang 6831650c40 [offloader] v2: Hide weight onloading latency via prefetching (#29941)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-02-25 17:20:59 -08:00

34 lines
1006 B
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test prefetch offloading correctness with Llama model."""
from ..utils import compare_two_settings
def test_prefetch_offload_llama():
    """Verify prefetch-based CPU offloading matches a no-offload baseline.

    Runs meta-llama/Llama-3.2-1B-Instruct under two settings and asserts
    identical outputs via ``compare_two_settings``:

    1. Prefetch offloading (group_size=8, num_in_group=2, prefetch_step=1),
       restricted to the MLP projection weights.
    2. Baseline with no offloading.

    This exercises prefetching-based offloading on a dense model.
    """
    # Prefetch offloading configuration, limited to MLP weights only.
    offload_args = [
        "--offload-group-size",
        "8",
        "--offload-num-in-group",
        "2",
        "--offload-prefetch-step",
        "1",
        # Selective offloading: only MLP weights
        "--offload-params",
        "gate_up_proj",
        "down_proj",
    ]
    # Baseline run uses no extra flags, i.e. no offloading at all.
    baseline_args: list[str] = []
    compare_two_settings(
        "meta-llama/Llama-3.2-1B-Instruct",
        offload_args,
        baseline_args,
    )