[MoE] Move DEEP_GEMM into experts/ subdirectory (#39005)
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -20,7 +20,7 @@ import matplotlib.pyplot as plt
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
persistent_masked_m_silu_mul_quant,
|
persistent_masked_m_silu_mul_quant,
|
||||||
)
|
)
|
||||||
from vllm.triton_utils import tl, triton
|
from vllm.triton_utils import tl, triton
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
|
|||||||
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
|
| ------ | ----------------- | ------------ | ------------- | ------------------- | --------------------- | ------- | ------ |
|
||||||
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
|
||||||
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
|
||||||
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
|
||||||
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
|
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
|
||||||
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
|
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
|
||||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||||
|
|||||||
@@ -7,14 +7,14 @@ import torch
|
|||||||
# Fused experts and PrepareFinalize imports
|
# Fused experts and PrepareFinalize imports
|
||||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||||
from vllm.model_executor.layers.fused_moe import TritonExperts
|
from vllm.model_executor.layers.fused_moe import TritonExperts
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
|
||||||
BatchedDeepGemmExperts,
|
|
||||||
)
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import (
|
from vllm.model_executor.layers.fused_moe.config import (
|
||||||
FusedMoEConfig,
|
FusedMoEConfig,
|
||||||
FusedMoEQuantConfig,
|
FusedMoEQuantConfig,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
|
BatchedDeepGemmExperts,
|
||||||
|
)
|
||||||
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
|
||||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||||
BatchedTritonExperts,
|
BatchedTritonExperts,
|
||||||
NaiveBatchedExperts,
|
NaiveBatchedExperts,
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
||||||
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
BatchedDeepGemmExperts,
|
BatchedDeepGemmExperts,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||||
BatchedPrepareAndFinalize,
|
BatchedPrepareAndFinalize,
|
||||||
BatchedTritonExperts,
|
BatchedTritonExperts,
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
|
|||||||
from vllm.model_executor.layers.fused_moe.config import (
|
from vllm.model_executor.layers.fused_moe.config import (
|
||||||
fp8_w8a8_moe_quant_config,
|
fp8_w8a8_moe_quant_config,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
|
||||||
_valid_deep_gemm_shape,
|
_valid_deep_gemm_shape,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||||
|
|||||||
@@ -47,10 +47,12 @@ if has_deep_ep():
|
|||||||
from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
|
from .parallel_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a
|
||||||
|
|
||||||
if has_deep_gemm():
|
if has_deep_gemm():
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
BatchedDeepGemmExperts,
|
BatchedDeepGemmExperts,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
|
||||||
|
DeepGemmExperts,
|
||||||
|
)
|
||||||
|
|
||||||
requires_deep_ep = pytest.mark.skipif(
|
requires_deep_ep = pytest.mark.skipif(
|
||||||
not has_deep_ep(),
|
not has_deep_ep(),
|
||||||
|
|||||||
@@ -175,7 +175,7 @@ def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch, workspace_i
|
|||||||
mp.setenv("VLLM_USE_DEEP_GEMM", "1")
|
mp.setenv("VLLM_USE_DEEP_GEMM", "1")
|
||||||
|
|
||||||
_DeepGemmExperts = importlib.import_module(
|
_DeepGemmExperts = importlib.import_module(
|
||||||
"vllm.model_executor.layers.fused_moe.deep_gemm_moe"
|
"vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe"
|
||||||
).DeepGemmExperts
|
).DeepGemmExperts
|
||||||
|
|
||||||
call_counter = {"cnt": 0}
|
call_counter = {"cnt": 0}
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import random
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
persistent_masked_m_silu_mul_quant,
|
persistent_masked_m_silu_mul_quant,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
|
|||||||
@@ -77,16 +77,18 @@ __all__ = [
|
|||||||
|
|
||||||
if HAS_TRITON:
|
if HAS_TRITON:
|
||||||
# import to register the custom ops
|
# import to register the custom ops
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
|
||||||
BatchedDeepGemmExperts,
|
|
||||||
)
|
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
|
||||||
CutlassBatchedExpertsFp8,
|
CutlassBatchedExpertsFp8,
|
||||||
CutlassExpertsFp8,
|
CutlassExpertsFp8,
|
||||||
CutlassExpertsW4A8Fp8,
|
CutlassExpertsW4A8Fp8,
|
||||||
cutlass_moe_w4a8_fp8,
|
cutlass_moe_w4a8_fp8,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
|
BatchedDeepGemmExperts,
|
||||||
|
)
|
||||||
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
|
||||||
|
DeepGemmExperts,
|
||||||
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||||
BatchedTritonExperts,
|
BatchedTritonExperts,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ def backend_to_kernel_cls(
|
|||||||
return [TritonOrDeepGemmExperts]
|
return [TritonOrDeepGemmExperts]
|
||||||
|
|
||||||
elif backend == Fp8MoeBackend.BATCHED_DEEPGEMM:
|
elif backend == Fp8MoeBackend.BATCHED_DEEPGEMM:
|
||||||
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
|
||||||
BatchedDeepGemmExperts,
|
BatchedDeepGemmExperts,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.fused_moe.config import (
|
|||||||
FusedMoEConfig,
|
FusedMoEConfig,
|
||||||
FusedMoEQuantConfig,
|
FusedMoEQuantConfig,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
|
||||||
DeepGemmExperts,
|
DeepGemmExperts,
|
||||||
_valid_deep_gemm,
|
_valid_deep_gemm,
|
||||||
_valid_deep_gemm_shape,
|
_valid_deep_gemm_shape,
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
|
from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
|
|
||||||
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
|
from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M
|
||||||
|
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts
|
||||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||||
TritonOrDeepGemmExperts,
|
TritonOrDeepGemmExperts,
|
||||||
|
|||||||
Reference in New Issue
Block a user