# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.compilation.passes.fusion.sequence_parallelism import (
    SP_MIN_HIDDEN_SIZE,
    SP_MIN_PER_GPU_SIZE_MB,
    get_sequence_parallelism_threshold,
)
|
|
|
|
|
|
class TestGetSequenceParallelismThreshold:
    """Unit tests for get_sequence_parallelism_threshold.

    The threshold is the minimum number of tokens for which sequence
    parallelism is expected to pay off; the function returns None when
    the platform or model shape makes SP inapplicable.
    """

    def test_non_cuda_returns_none(self, mock_cuda_platform):
        """A non-CUDA platform yields no threshold at all."""
        with mock_cuda_platform(is_cuda=False):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
            assert threshold is None

    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
        """A device capability without tuned constants (e.g. sm80) yields None."""
        with mock_cuda_platform(capability=(8, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
            assert threshold is None

    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
        """On H100, a hidden size under the sm90 minimum yields None."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                # 4096 is below the sm90 minimum hidden size of 8192.
                hidden_size=4096,
                tp_size=2,
                element_size=2,
            )
            assert threshold is None

    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
        """On H100 a large-enough hidden size produces the computed threshold."""
        with mock_cuda_platform(capability=(9, 0)):
            hidden_size = 8192
            tp_size = 2
            element_size = 2  # float16/bfloat16

            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )

            # Cross-check against the formula the implementation uses:
            # (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
            mib = 1024 * 1024
            expected = int(
                (SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * mib)
                // (hidden_size * element_size)
            )
            assert threshold == expected
            assert threshold == 1024

    @pytest.mark.parametrize(
        "hidden_size,tp_size,element_size,expected",
        [
            # Boundary: exactly at min hidden size threshold, tp_size=1
            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
            (8192, 1, 2, 512),
            # Larger hidden size reduces token threshold
            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
            (16384, 1, 2, 256),
            # Larger tp_size increases token threshold
            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
            (8192, 4, 2, 2048),
            # Larger element_size (fp32) reduces token threshold
            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
            (8192, 2, 4, 512),
        ],
    )
    def test_threshold_calculation_variations(
        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
    ):
        """The threshold scales with tp_size and inversely with size-per-token."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )
            assert threshold == expected

    def test_hidden_size_boundary(self, mock_cuda_platform):
        """The sm90 hidden-size minimum is inclusive: size-1 fails, size passes."""
        with mock_cuda_platform(capability=(9, 0)):
            # One below the minimum -> disabled.
            threshold = get_sequence_parallelism_threshold(
                hidden_size=SP_MIN_HIDDEN_SIZE[90] - 1,
                tp_size=2,
                element_size=2,
            )
            assert threshold is None

            # Exactly at the minimum -> enabled.
            threshold = get_sequence_parallelism_threshold(
                hidden_size=SP_MIN_HIDDEN_SIZE[90],
                tp_size=2,
                element_size=2,
            )
            assert threshold is not None