[Hardware][CPU] Add embedding models support for CPU backend (#10193)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
@@ -5,7 +5,6 @@ from torch import nn
 from transformers import BertConfig
 
 from vllm.attention import Attention, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersImpl
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
@@ -218,11 +217,6 @@ class BertSelfAttention(nn.Module):
                               quant_config=quant_config,
                               prefix=f"{prefix}.attn")
 
-        if not isinstance(self.attn.impl, XFormersImpl):
-            raise ValueError(
-                "Encoder-only models currently require XFORMERS attention "
-                "backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.")
-
     def forward(
         self,
         hidden_states: torch.Tensor,
Reference in New Issue
Block a user