# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any

import torch

from vllm.distributed import (
    get_ep_group,
)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
    FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (
    FlashInferA2APrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    make_moe_prepare_and_finalize_naive_dp_ep,
    make_moe_prepare_and_finalize_no_dp_ep,
)
from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_ep, has_mori

logger = init_logger(__name__)

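# DeepEP and mori provide optional all2all prepare/finalize backends; their
# modules are imported only on CUDA-alike platforms where the corresponding
# packages are installed.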
if current_platform.is_cuda_alike():
    if has_deep_ep():
        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
        from .deepep_ll_prepare_finalize import (
            DEEPEP_QUANT_BLOCK_SHAPE,
            DeepEPLLPrepareAndFinalize,
        )
    if has_mori():
        from .mori_prepare_finalize import MoriPrepareAndFinalize


def maybe_roundup_layer_hidden_size(
    hidden_size: int,
    act_dtype: torch.dtype,
    moe_parallel_config: FusedMoEParallelConfig,
) -> int:
    """
    Given the layer hidden size and MoE configuration, round up hidden_size
    if necessary.

    Args:
        hidden_size: Layer hidden size.
        act_dtype: Data type of the layer activations.
        moe_parallel_config: Fused MoE parallelization strategy configuration.

    Returns:
        The rounded-up hidden_size if rounding is required by the configs and
        the all2all backend, otherwise the original hidden_size.
    """
    if moe_parallel_config.use_deepep_ht_kernels:
        hidden_size = DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size(
            hidden_size, act_dtype
        )

    if moe_parallel_config.use_deepep_ll_kernels:
        hidden_size = DeepEPLLPrepareAndFinalize.maybe_roundup_layer_hidden_size(
            hidden_size
        )

    return hidden_size


def maybe_make_prepare_finalize(
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig | None,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    allow_new_interface: bool = False,
    use_monolithic: bool = False,
) -> FusedMoEPrepareAndFinalize | None:
    # NOTE(rob): we are migrating each quant_method to hold the MK
    # in all cases. The allow_new_interface=False flag allows us to fall
    # back to the old method for methods that have not yet been migrated.
    #
    # In the old method:
    # * maybe_init_modular_kernel() calls this function. If we are using
    #   no DP/EP or naive all2all, this function returns None and no
    #   ModularKernelMethod is created. If non-naive all2all is used, this
    #   returns a PrepareAndFinalize object and a ModularKernelMethod is
    #   created.
    # In the new method:
    # * maybe_make_prepare_finalize() is called from the oracle. We
    #   always return a PrepareAndFinalize object and the quant method
    #   holds the ModularKernel.
    if not moe.moe_parallel_config.use_all2all_kernels:
        if not allow_new_interface:
            return None

        # For the DP/TP case, fall back to naive P/F.
        if moe.moe_parallel_config.dp_size > 1:
            logger.info_once(
                "Detected DP deployment with no --enable-expert-parallel. "
                "Falling back to AllGather+ReduceScatter dispatch/combine."
            )
            return make_moe_prepare_and_finalize_naive_dp_ep(
                is_sequence_parallel=moe.moe_parallel_config.is_sequence_parallel,
                num_dispatchers=(
                    get_ep_group().device_communicator.all2all_manager.world_size
                ),
                use_monolithic=use_monolithic,
            )
        else:
            return make_moe_prepare_and_finalize_no_dp_ep(use_monolithic)

    all2all_manager = get_ep_group().device_communicator.all2all_manager
    assert all2all_manager is not None

    prepare_finalize: FusedMoEPrepareAndFinalize | None = None

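    # DeepEP high-throughput backend: the expert offset for this rank is
    # rank * moe.num_local_experts (contiguous expert placement per EP rank).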
    if moe.use_deepep_ht_kernels:
        assert moe.dp_size == all2all_manager.dp_world_size

        all_to_all_args: dict[str, Any] = dict()
        handle = all2all_manager.get_handle(all_to_all_args)
        prepare_finalize = DeepEPHTPrepareAndFinalize(
            handle,
            num_dispatchers=all2all_manager.world_size,
            dp_size=all2all_manager.dp_world_size,
            rank_expert_offset=all2all_manager.rank * moe.num_local_experts,
        )

    elif moe.use_deepep_ll_kernels:
        assert quant_config is not None
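        # Optional expert-id remapping tables (global->physical, physical->global,
        # and the global ids of this rank's local experts) are forwarded to the
        # DeepEP low-latency prepare/finalize when provided.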
        global_to_physical = physical_to_global = local_expert_global_ids = None
        if routing_tables is not None:
            (
                global_to_physical,
                physical_to_global,
                local_expert_global_ids,
            ) = routing_tables
        all_to_all_args = dict(
            max_num_tokens_per_dp_rank=moe.max_num_tokens,
            token_hidden_size=moe.hidden_dim,
            num_ep_ranks=all2all_manager.world_size,
            num_global_experts=moe.num_experts,
            num_local_experts=moe.num_experts // all2all_manager.world_size,
        )
        handle = all2all_manager.get_handle(all_to_all_args)

        # Note: We may want to use FP8 dispatch just to reduce
        # data movement.
        use_fp8_dispatch = (
            quant_config.quant_dtype == current_platform.fp8_dtype()
            and quant_config.block_shape == DEEPEP_QUANT_BLOCK_SHAPE
        )

        prepare_finalize = DeepEPLLPrepareAndFinalize(
            handle,
            max_tokens_per_rank=moe.max_num_tokens,
            num_dispatchers=all2all_manager.world_size,
            use_fp8_dispatch=use_fp8_dispatch,
            global_to_physical=global_to_physical,
            physical_to_global=physical_to_global,
            local_expert_global_ids=local_expert_global_ids,
        )
    elif moe.use_mori_kernels:
        assert quant_config is not None

        # Note: We may want to use FP8 dispatch just to reduce
        # data movement.
        use_fp8_dispatch = (
            quant_config.is_per_act_token or quant_config.is_block_quantized
        )
        # For PTPC (per token per channel) quant, the scale dim for each token is 1
        # For 1x128 quant, the scale dim for each token is hidden_dim // 128
        scale_dim = 1 if quant_config.is_per_act_token else moe.hidden_dim // 128
        all_to_all_args = dict(
            rank=all2all_manager.rank,
            num_ep_ranks=all2all_manager.world_size,
            quant_dtype=quant_config.quant_dtype,
            token_hidden_size=moe.hidden_dim,
            scale_dim=scale_dim,
            scale_type_size=torch.float32.itemsize,
            max_num_tokens_per_dp_rank=moe.max_num_tokens,
            input_dtype=moe.in_dtype,
            num_local_experts=moe.num_experts // all2all_manager.world_size,
            num_experts_per_token=moe.experts_per_token,
        )
        handle = all2all_manager.get_handle(all_to_all_args)

        prepare_finalize = MoriPrepareAndFinalize(
            handle,
            max_tokens_per_rank=moe.max_num_tokens,
            num_dispatchers=all2all_manager.world_size,
            use_fp8_dispatch=use_fp8_dispatch,
        )

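    # FlashInfer all2all-v backend: only the number of dispatchers is needed
    # here; the remaining setup is handled by FlashInferA2APrepareAndFinalize.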
    elif moe.use_fi_all2allv_kernels:
        assert quant_config is not None
        prepare_finalize = FlashInferA2APrepareAndFinalize(
            num_dispatchers=all2all_manager.world_size,
        )

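    # Naive all2all: only built here under the new interface; with the legacy
    # interface this function returns None for the naive backend (see NOTE above).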
    elif moe.use_naive_all2all_kernels and allow_new_interface:
        prepare_finalize = make_moe_prepare_and_finalize_naive_dp_ep(
            use_monolithic=use_monolithic,
            is_sequence_parallel=moe.moe_parallel_config.is_sequence_parallel,
            num_dispatchers=all2all_manager.world_size,
        )

    return prepare_finalize