Remove hardcoded device="cuda" to support more devices (#2503)
Co-authored-by: Jiang Li <jiang1.li@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
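The diff below swaps the per-tensor device='cuda' arguments for a single torch.set_default_device(device) call at the top of the test, so the tensor factory functions (torch.empty, torch.zeros, torch.arange, torch.tensor) allocate on whatever device the test is parametrized with. A minimal standalone sketch of that pattern (not part of the commit; it assumes PyTorch 2.0 or newer, where torch.set_default_device exists, and the helper name make_buffers is illustrative only):

import torch

def make_buffers(num_tokens: int, num_heads: int, head_size: int,
                 dtype: torch.dtype, device: str):
    # One set_default_device call replaces every per-tensor device='cuda' argument.
    torch.set_default_device(device)
    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
    return query, output

# Runs on CPU-only machines; pass "cuda:0" instead when a GPU is present.
q, o = make_buffers(8, 12, 128, torch.float16, "cpu")
print(q.device)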
@@ -11,19 +11,27 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
 NUM_HEADS = [12]
 HEAD_SIZES = [128]
 DTYPES = [torch.float16]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_contexted_kv_attention(
     num_heads: int,
     head_size: int,
     dtype: torch.dtype,
+    device: str,
 ) -> None:
     random.seed(0)
     torch.manual_seed(0)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(0)
+    torch.set_default_device(device)
     MAX_SEQ_LEN = 1024
     MAX_CTX_LEN = 1024
     BS = 10
@@ -35,24 +43,11 @@ def test_contexted_kv_attention(
     seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]

     num_tokens = sum(subquery_lens)
-    query = torch.empty(num_tokens,
-                        num_heads,
-                        head_size,
-                        dtype=dtype,
-                        device='cuda')
+    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
     query.uniform_(-1e-3, 1e-3)
-    output = torch.empty(num_tokens,
-                         num_heads,
-                         head_size,
-                         dtype=dtype,
-                         device='cuda')
+    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)

-    kv = torch.empty(sum(seq_lens),
-                     2,
-                     num_heads,
-                     head_size,
-                     dtype=dtype,
-                     device='cuda')
+    kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, dtype=dtype)
     kv.uniform_(-1e-3, 1e-3)
     key, value = kv.unbind(dim=1)

@@ -60,39 +55,27 @@ def test_contexted_kv_attention(
                           block_size,
                           num_heads,
                           head_size,
-                          dtype=dtype,
-                          device='cuda')
+                          dtype=dtype)
     v_cache = torch.zeros(cache_size,
                           block_size,
                           num_heads,
                           head_size,
-                          dtype=dtype,
-                          device='cuda')
-    k = torch.zeros(sum(subquery_lens),
-                    num_heads,
-                    head_size,
-                    dtype=dtype,
-                    device='cuda')
-    v = torch.zeros(sum(subquery_lens),
-                    num_heads,
-                    head_size,
-                    dtype=dtype,
-                    device='cuda')
-    values = torch.arange(0, cache_size, dtype=torch.long, device='cuda')
+                          dtype=dtype)
+    k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+    v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
+    values = torch.arange(0, cache_size, dtype=torch.long)
     values = values[torch.randperm(cache_size)]
     block_table = values[:BS * max_block_per_request].view(
         BS, max_block_per_request)
-    b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda')
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda')
+    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
+    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
     b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
-                                            dtype=torch.long,
-                                            device='cuda'),
+                                            dtype=torch.long),
                                dim=0)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
     b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long,
-                                                device='cuda'),
+                                                dtype=torch.long),
                                    dim=0)
     for i in range(BS):
         for j in range(subquery_lens[i]):
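The new CUDA_DEVICES list parametrizes the test over one GPU, or two when more than one is visible. A hedged, standalone sketch of that parametrization pattern (not the vLLM test itself; the test name and the CPU fallback here are illustrative additions):

import pytest
import torch

DEVICES = ([f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
           if torch.cuda.is_available() else ["cpu"])

@pytest.mark.parametrize("device", DEVICES)
def test_default_device_roundtrip(device: str) -> None:
    # Each device string becomes its own pytest case; tensors land on that device.
    torch.set_default_device(device)
    x = torch.zeros(4, dtype=torch.float16)
    assert str(x.device).startswith(device.split(":")[0])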