[Feature] Add env var VLLM_MOE_USE_DEEP_GEMM (#28422)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -966,10 +966,18 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
max_num_tokens=max_num_tokens_per_rank,
|
||||
num_dispatchers=prepare_finalize.num_dispatchers(),
|
||||
quant_config=self.moe_quant_config,
|
||||
allow_deep_gemm=(
|
||||
envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
|
||||
),
|
||||
)
|
||||
else:
|
||||
logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
|
||||
return TritonOrDeepGemmExperts(self.moe_quant_config, allow_deep_gemm=True)
|
||||
return TritonOrDeepGemmExperts(
|
||||
self.moe_quant_config,
|
||||
allow_deep_gemm=(
|
||||
envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
|
||||
),
|
||||
)
|
||||
|
||||
def get_fused_moe_quant_config(
|
||||
self, layer: torch.nn.Module
|
||||
|
||||
@@ -158,7 +158,7 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
|
||||
return Fp8MoeBackend.MARLIN
|
||||
|
||||
# deepGEMM on supported platforms with block-quantized weights
|
||||
if envs.VLLM_USE_DEEP_GEMM and block_quant:
|
||||
if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
|
||||
if not has_deep_gemm():
|
||||
logger.warning_once("DeepGEMM backend requested but not available.")
|
||||
elif is_deep_gemm_supported():
|
||||
|
||||
Reference in New Issue
Block a user