Separate attention backends (#3005)
This commit is contained in:
@@ -43,7 +43,7 @@ import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from vllm.model_executor.input_metadata import InputMetadata
|
||||
from vllm.model_executor.layers.attention import PagedAttention
|
||||
from vllm.model_executor.layers.attention import Attention
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
LinearMethodBase,
|
||||
@@ -126,9 +126,9 @@ class OlmoAttention(nn.Module):
|
||||
base=rope_theta,
|
||||
)
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.attn = PagedAttention(self.num_heads,
|
||||
self.head_dim,
|
||||
scale=self.scaling)
|
||||
self.attn = Attention(self.num_heads,
|
||||
self.head_dim,
|
||||
scale=self.scaling)
|
||||
|
||||
# Attention output projection.
|
||||
self.attn_out = RowParallelLinear(
|
||||
|
||||
Reference in New Issue
Block a user