[CPU] Refactor CPU unquantized linear (#24150)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-09-04 14:28:45 +08:00
parent cb55ad86fe
commit 57b1ce94f7
9 changed files with 466 additions and 26 deletions
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -40,6 +40,12 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, extra_weight_attrs)

+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:  # returns None; only acts when the platform check below passes
+        if current_platform.is_cpu():  # no else branch: on any other platform this method does nothing
+            from vllm.model_executor.layers.utils import (  # function-scope import, executed only on the CPU path
+                dispatch_cpu_unquantized_gemm)
+            dispatch_cpu_unquantized_gemm(layer, remove_weight=False)  # NOTE(review): remove_weight=False presumably keeps layer.weight available afterwards -- confirm in layers/utils
+
    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,