[Kernel][Attention] Separate Attention.kv_scale into k_scale and v_scale (#6081)

Michael Goin authored 2024-07-16 18:31:32 -04:00; committed by GitHub
parent 160e1d8c99
commit 978aed5300
33 changed files with 317 additions and 185 deletions
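The change splits the single fused kv_scale used for FP8 KV-cache quantization into separate k_scale and v_scale, so keys and values are each scaled by their own factor. The sketch below is a minimal illustration of why per-tensor scales help; quantize_fp8 and the example dynamic ranges are assumptions for illustration, not code from this commit.

    import torch

    # Illustrative sketch only: `quantize_fp8` is a hypothetical helper,
    # not vLLM API. It scales a tensor into the FP8 range and casts.
    def quantize_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        return (x / scale).to(torch.float8_e4m3fn)

    # Keys and values often have very different dynamic ranges.
    key = torch.randn(16, 128) * 0.1
    value = torch.randn(16, 128) * 4.0

    # Before: one shared scale must cover both tensors (448 is the largest
    # normal value representable in float8_e4m3fn).
    kv_scale = torch.maximum(key.abs().max(), value.abs().max()) / 448.0

    # After: each tensor gets a scale matched to its own range.
    k_scale = key.abs().max() / 448.0
    v_scale = value.abs().max() / 448.0

    k_fp8 = quantize_fp8(key, k_scale)
    v_fp8 = quantize_fp8(value, v_scale)

With a shared kv_scale, the smaller-range tensor (the keys above) is quantized with a scale sized for the larger-range tensor, pushing its values toward zero and wasting most of the FP8 resolution; separate scales avoid that.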


@@ -196,6 +196,15 @@ class ReplicatedLinear(LinearBase):
         else:
             self.register_parameter("bias", None)
 
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        # If the weight on disk does not have a shape, give it one
+        # (such as scales for AutoFp8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
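The reshape in the new loader exists because some checkpoints (such as AutoFp8) serialize per-tensor scales as 0-dim tensors, while the module registers them as 1-element parameters. A minimal standalone check of that path, assuming only torch:

    import torch

    # AutoFp8-style checkpoints can store a scale as a 0-dim tensor.
    loaded_weight = torch.tensor(0.0213)
    assert loaded_weight.shape == torch.Size([])  # scalar: no dimensions

    # The loader gives it a shape so it matches the registered
    # 1-element parameter before the copy.
    if len(loaded_weight.shape) == 0:
        loaded_weight = loaded_weight.reshape(1)

    param = torch.nn.Parameter(torch.zeros(1), requires_grad=False)
    assert param.size() == loaded_weight.size()
    param.data.copy_(loaded_weight)

Without the reshape, the size assertion would fail: torch.Size([]) does not equal torch.Size([1]) even though both hold a single element.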