Support Roberta embedding models (#9387)

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Flavia Beo <flavia.beo@ibm.com>
Co-authored-by: Flavia Beo <flavia.beo@ibm.com>
2024-11-14 18:23:29 -03:00
parent 1dbae0329c
commit 4a18fd14ba
10 changed files with 202 additions and 14 deletions
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -385,6 +385,9 @@ void paged_attention_v1_impl_launcher(
  int* seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
+    case 32:
+      LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+      break;
    case 64:
      LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
      break;
@@ -702,6 +705,9 @@ void paged_attention_v2_impl_launcher(
  int* seq_lens_ptr = seq_lens.data_ptr<int>();

  switch (head_size) {
+    case 32:
+      LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+      break;
    case 64:
      LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
      break;