[Bugfix] Allow vllm to still work if triton is not installed. (#6786)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Author: Thomas Parnell
Date:   2024-07-29 23:51:27 +02:00
Committer: GitHub
Parent: 7f8d612d24
Commit: 9a7e2d0534
13 changed files with 65 additions and 37 deletions

@@ -6,8 +6,7 @@ from torch.nn.parameter import Parameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
-                                                  fused_moe)
+from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -404,6 +403,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
               num_expert_group: Optional[int] = None,
               topk_group: Optional[int] = None) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe import fused_moe
         return fused_moe(x,
                          layer.w13_weight,
                          layer.w2_weight,
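
The change is the standard deferred-import pattern: fused_moe is a triton-backed kernel, so importing it at module scope makes importing vllm fail outright on machines without triton installed. Moving the import into the method body postpones resolution until the triton-dependent path actually runs. A minimal, self-contained sketch of the pattern (module and function names below are illustrative, not vllm's actual API):

# lazy_triton.py -- illustrative sketch of the deferred-import pattern
# applied in this commit; names here are hypothetical, not vllm's.

def triton_path(x):
    # Importing inside the function keeps "import lazy_triton" working on
    # systems without triton; the ImportError surfaces only if this
    # triton-backed path is actually executed.
    import triton  # optional dependency, resolved lazily
    return f"triton {triton.__version__}: {x}"

def cpu_path(x):
    # Code that does not need triton keeps working regardless.
    return x

if __name__ == "__main__":
    print(cpu_path(42))  # succeeds even when triton is not installed

The trade-off is a per-call module lookup (cheap after the first call, since Python caches imports in sys.modules) in exchange for keeping the package importable without the optional dependency.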