[Refactor] Remove Duplicate per_block_cast_to_fp8, Remove Dependencies of DeepGEMM (#21787)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2025-07-31 21:13:27 -04:00
committed by GitHub
parent 0bd409cf01
commit 3700642013
8 changed files with 55 additions and 132 deletions

View File

@@ -5,8 +5,7 @@ from typing import Optional
import torch
import vllm._custom_ops as ops
-from tests.kernels.quant_utils import (per_block_cast_to_fp8,
-                                       per_block_cast_to_int8)
+from tests.kernels.quant_utils import per_block_cast_to_int8
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
@@ -15,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.utils import round_up
+from vllm.utils.deep_gemm import per_block_cast_to_fp8
def triton_moe(