diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 4900949ad..630ea2e3f 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -26,9 +26,10 @@ from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_co from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( triton_kernel_moe_forward, ) -from vllm.model_executor.layers.utils import shuffle_weight from vllm.utils.math_utils import round_up +from .utils import shuffle_weight + def deshuffle(w: torch.Tensor): first = w[..., ::2] diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py index cf9ff1863..99d96e970 100644 --- a/tests/kernels/moe/test_modular_oai_triton_moe.py +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -33,11 +33,10 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP, ) -from vllm.model_executor.layers.utils import shuffle_weight from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed -from .utils import make_dummy_moe_config +from .utils import make_dummy_moe_config, shuffle_weight MNK = [ (1, 512, 384), diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index ef72b96be..e0a234111 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -33,6 +33,16 @@ from vllm.utils.deep_gemm import per_block_cast_to_fp8 from vllm.utils.math_utils import round_up +def shuffle_weight(w: torch.Tensor) -> torch.Tensor: + """Fold weights to adjacent locations for Triton MoE / SwiGLU kernel layout.""" + shape = w.shape + n = shape[-1] + first = w[..., : n // 2] + second = w[..., n // 2 :] + stacked = torch.stack((first, second), dim=-1) + return stacked.reshape(shape) + + def make_dummy_moe_config( num_experts: int = 1, experts_per_token: int = 1, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 79d48a203..d1e35f583 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -31,27 +31,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool: return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES -def shuffle_weight(w: torch.Tensor) -> torch.Tensor: - # Shuffle weight along the last dimension so that - # we folded the weights to adjance location - # Example: - # input: - # [[1, 2, 3, 4, 5, 6], - # [7, 8, 9, 10, 11, 12]] - # output: - # [[1, 4, 2, 5, 3, 6], - # [7, 10, 8, 11, 9, 12]] - # This will be used together with triton swiglu kernel - shape = w.shape - N = shape[-1] - first = w[..., : N // 2] - second = w[..., N // 2 :] - - stacked = torch.stack((first, second), dim=-1) - w_shuffled = stacked.reshape(shape) - return w_shuffled - - def get_token_bin_counts_and_mask( tokens: torch.Tensor, vocab_size: int,