[Misc] Support FP8 MoE for compressed-tensors (#8588)

2024-09-25 12:43:36 -04:00
parent 64840dfae4
commit 873edda6cf
5 changed files with 226 additions and 8 deletions
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -321,13 +321,13 @@ class PhiMoEAttention(nn.Module):
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=True,
-            quant_config=None,
+            quant_config=quant_config,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=True,
-            quant_config=None,
+            quant_config=quant_config,
        )
        self.rotary_emb = get_rope(
            self.head_dim,