[torch.compile] Sequence Parallelism threshold compile ranges (#28672)
Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com> Signed-off-by: Jason Li <jasonlizhengjian@gmail.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
110
tests/compile/test_sequence_parallelism_threshold.py
Normal file
110
tests/compile/test_sequence_parallelism_threshold.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.compilation.passes.fusion.sequence_parallelism import (
|
||||
SP_MIN_HIDDEN_SIZE,
|
||||
SP_MIN_PER_GPU_SIZE_MB,
|
||||
get_sequence_parallelism_threshold,
|
||||
)
|
||||
|
||||
|
||||
class TestGetSequenceParallelismThreshold:
|
||||
"""Tests for get_sequence_parallelism_threshold function."""
|
||||
|
||||
def test_non_cuda_returns_none(self, mock_cuda_platform):
|
||||
"""Non-CUDA platforms should return None."""
|
||||
with mock_cuda_platform(is_cuda=False):
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=8192, tp_size=2, element_size=2
|
||||
)
|
||||
assert result is None
|
||||
|
||||
def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
|
||||
"""Unsupported device capabilities (e.g., sm80) should return None."""
|
||||
with mock_cuda_platform(capability=(8, 0)):
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=8192, tp_size=2, element_size=2
|
||||
)
|
||||
assert result is None
|
||||
|
||||
def test_small_hidden_size_returns_none(self, mock_cuda_platform):
|
||||
"""H100 with hidden_size below threshold should return None."""
|
||||
with mock_cuda_platform(capability=(9, 0)):
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=4096,
|
||||
tp_size=2,
|
||||
element_size=2, # 4096 < 8192
|
||||
)
|
||||
assert result is None
|
||||
|
||||
def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
|
||||
"""H100 with large enough hidden_size should return calculated threshold."""
|
||||
with mock_cuda_platform(capability=(9, 0)):
|
||||
hidden_size = 8192
|
||||
tp_size = 2
|
||||
element_size = 2 # float16/bfloat16
|
||||
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=hidden_size,
|
||||
tp_size=tp_size,
|
||||
element_size=element_size,
|
||||
)
|
||||
|
||||
# Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
|
||||
MiB = 1024 * 1024
|
||||
expected = int(
|
||||
(SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * MiB)
|
||||
// (hidden_size * element_size)
|
||||
)
|
||||
assert result == expected
|
||||
assert result == 1024
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"hidden_size,tp_size,element_size,expected",
|
||||
[
|
||||
# Boundary: exactly at min hidden size threshold, tp_size=1
|
||||
# (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
|
||||
(8192, 1, 2, 512),
|
||||
# Larger hidden size reduces token threshold
|
||||
# (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
|
||||
(16384, 1, 2, 256),
|
||||
# Larger tp_size increases token threshold
|
||||
# (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
|
||||
(8192, 4, 2, 2048),
|
||||
# Larger element_size (fp32) reduces token threshold
|
||||
# (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
|
||||
(8192, 2, 4, 512),
|
||||
],
|
||||
)
|
||||
def test_threshold_calculation_variations(
|
||||
self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
|
||||
):
|
||||
"""Test threshold calculation with various parameter combinations."""
|
||||
with mock_cuda_platform(capability=(9, 0)):
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=hidden_size,
|
||||
tp_size=tp_size,
|
||||
element_size=element_size,
|
||||
)
|
||||
assert result == expected
|
||||
|
||||
def test_hidden_size_boundary(self, mock_cuda_platform):
|
||||
"""Test behavior at the exact hidden_size boundary."""
|
||||
with mock_cuda_platform(capability=(9, 0)):
|
||||
# Just below threshold
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=SP_MIN_HIDDEN_SIZE[90] - 1,
|
||||
tp_size=2,
|
||||
element_size=2,
|
||||
)
|
||||
assert result is None
|
||||
|
||||
# Exactly at threshold
|
||||
result = get_sequence_parallelism_threshold(
|
||||
hidden_size=SP_MIN_HIDDEN_SIZE[90],
|
||||
tp_size=2,
|
||||
element_size=2,
|
||||
)
|
||||
assert result is not None
|
||||
Reference in New Issue
Block a user