Separate attention backends (#3005)

This commit is contained in:
Woosuk Kwon
2024-03-07 01:45:50 -08:00
committed by GitHub
parent cbf4c05b15
commit 2daf23ab0c
35 changed files with 561 additions and 271 deletions

View File

@@ -43,7 +43,7 @@ import torch.nn.functional as F
from torch import nn
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.attention import PagedAttention
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
LinearMethodBase,
@@ -126,9 +126,9 @@ class OlmoAttention(nn.Module):
base=rope_theta,
)
self.scaling = self.head_dim**-0.5
self.attn = PagedAttention(self.num_heads,
self.head_dim,
scale=self.scaling)
self.attn = Attention(self.num_heads,
self.head_dim,
scale=self.scaling)
# Attention output projection.
self.attn_out = RowParallelLinear(