Support sequence parallelism combined with pipeline parallelism (#18243)
Signed-off-by: cascade812 <cascade812@outlook.com>
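This change threads a pipeline-parallel degree (pp_size) through the sequence-parallelism test suite: ParallelSetup gains a pp_size field, SPTestSettings.detailed() and SPTestSettings.fast() accept a pp_base argument and emit setups at both pp_base and 2 * pp_base, and _compare_sp() reads pp_size from each setup instead of hardcoding it to 1. For orientation, a minimal user-side sketch of combining the two modes, assuming vLLM's tensor_parallel_size / pipeline_parallel_size arguments and the enable_sequence_parallelism pass option; the exact compilation-config shape is an assumption, not part of this diff:

from vllm import LLM

# Hypothetical usage sketch (not from the diff): serve with TP=2 and
# PP=2 while the sequence-parallelism compilation pass is enabled.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    tensor_parallel_size=2,
    pipeline_parallel_size=2,
    compilation_config={
        "pass_config": {"enable_sequence_parallelism": True},
    },
)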
@@ -26,6 +26,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
 
 class ParallelSetup(NamedTuple):
     tp_size: int
+    pp_size: int
     sp_enabled: bool
     eager_mode: bool
     chunked_prefill: bool
@@ -60,6 +61,7 @@ class SPTestSettings:
     def detailed(
         *,
         tp_base: int = 2,
+        pp_base: int = 1,
         multi_node_only: bool = False,
         task: TaskOption = "auto",
         load_format: Optional[str] = None,
@@ -67,18 +69,42 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=True),
                 ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=True,
                               chunked_prefill=False),
                 ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=True,
-                              chunked_prefill=True)
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              sp_enabled=True,
+                              eager_mode=True,
+                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              sp_enabled=True,
+                              eager_mode=True,
+                              chunked_prefill=True)
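The eight detailed() setups above are the full cross product of two pipeline sizes, two eager-mode settings, and two chunked-prefill settings, all with sp_enabled=True. A compact sketch of the same matrix, for clarity only; the diff spells the cases out literally:

from itertools import product

# Builds the same eight ParallelSetup cases as detailed() above,
# in the same order.
parallel_setups = [
    ParallelSetup(tp_size=tp_base,
                  pp_size=pp_size,
                  sp_enabled=True,
                  eager_mode=eager_mode,
                  chunked_prefill=chunked_prefill)
    for pp_size, eager_mode, chunked_prefill in product(
        (pp_base, 2 * pp_base), (False, True), (False, True))
]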
@@ -94,6 +120,7 @@ class SPTestSettings:
     def fast(
         *,
         tp_base: int = 2,
+        pp_base: int = 1,
         task: TaskOption = "auto",
         multi_node_only: bool = False,
         load_format: Optional[str] = None,
@@ -101,6 +128,12 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=[
                 ParallelSetup(tp_size=tp_base,
+                              pp_size=pp_base,
                               sp_enabled=True,
                               eager_mode=False,
                               chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base,
+                              pp_size=2 * pp_base,
+                              sp_enabled=True,
+                              eager_mode=False,
+                              chunked_prefill=False),
@@ -136,6 +169,7 @@ def _compare_sp(
 ):
     (
         tp_size,
+        pp_size,
         sp_enabled,
         eager_mode,
         chunked_prefill,
@@ -167,7 +201,6 @@ def _compare_sp(
     else:
         model_info.check_available_online(on_fail="skip")
 
-    pp_size = 1
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
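Dropping the hardcoded pp_size = 1 means the GPU-count guard now scales with the pipeline degree taken from each ParallelSetup. A self-contained sketch of the guard's effect:

# tp=2, pp=2 needs 4 visible GPUs; with fewer, the case is skipped.
def needs_skip(num_gpus_available: int, tp_size: int, pp_size: int) -> bool:
    return num_gpus_available < tp_size * pp_size

assert needs_skip(2, tp_size=2, pp_size=2)
assert not needs_skip(4, tp_size=2, pp_size=2)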
@@ -256,7 +289,7 @@ def _compare_sp(
 
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.detailed(),
+    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
 }
 
 SP_TEST_MODELS = [
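Switching the Llama-3.2-1B entry from SPTestSettings.detailed() to SPTestSettings.fast() trims the per-model matrix to the two pp_size variants. A hypothetical sketch of how such a mapping can feed pytest parametrization; the helper shape here is illustrative, not taken from the test file:

import pytest

# One test parameter per (model, ParallelSetup) pair.
params = [
    pytest.param(model_id, setup, id=f"{model_id}-pp{setup.pp_size}")
    for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
    for setup in settings.parallel_setups
]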