[Hardware][CPU] Add embedding models support for CPU backend (#10193)

Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
Isotr0py
2024-11-11 16:54:28 +08:00
committed by GitHub
parent 9804ac7c7c
commit 58170d6503
9 changed files with 185 additions and 52 deletions

View File

@@ -5,7 +5,6 @@ from torch import nn
from transformers import BertConfig
from vllm.attention import Attention, AttentionMetadata, AttentionType
from vllm.attention.backends.xformers import XFormersImpl
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn
@@ -218,11 +217,6 @@ class BertSelfAttention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.attn")
if not isinstance(self.attn.impl, XFormersImpl):
raise ValueError(
"Encoder-only models currently require XFORMERS attention "
"backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.")
def forward(
self,
hidden_states: torch.Tensor,