Memcpy kernel for flash attention (#29)

* optimize

* add benchmark

* add assert

* add test
Author: Siyuan (Ryans) Zhuang
Committed: 2023-04-10 18:22:49 -07:00 (via GitHub)
parent b9926f7f66
commit e3cec88aa5
4 changed files with 293 additions and 0 deletions
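For orientation before the diff: gather_cached_kv is the memcpy-style kernel this commit adds and tests. It is the inverse of reshape_and_cache: for each token it derives a block index and in-block offset from slot_mapping, reads that token's entry out of the paged key/value caches, and writes it into contiguous key/value tensors. A minimal pure-PyTorch sketch of those semantics, assuming the cache layouts used in the test below (the helper name gather_cached_kv_ref and its vectorized indexing are illustrative, not part of the commit):

import torch

def gather_cached_kv_ref(
    key: torch.Tensor,           # out: [num_tokens, num_heads, head_size]
    value: torch.Tensor,         # out: [num_tokens, num_heads, head_size]
    key_cache: torch.Tensor,     # [num_blocks, num_heads, head_size // x, block_size, x]
    value_cache: torch.Tensor,   # [num_blocks, num_heads, head_size, block_size]
    slot_mapping: torch.Tensor,  # [num_tokens], flat slot index per token
) -> None:
    num_tokens, num_heads, head_size = key.shape
    block_size = value_cache.shape[3]
    block_idx = slot_mapping // block_size  # block holding each token
    block_off = slot_mapping % block_size   # token's position within its block
    # Keys: gather the (head_size // x, x) tiles, then flatten back to head_size.
    gathered = key_cache[block_idx, :, :, block_off, :]
    key.copy_(gathered.reshape(num_tokens, num_heads, head_size))
    # Values: a plain gather along the block-offset dimension.
    value.copy_(value_cache[block_idx, :, :, block_off])

The CUDA kernel performs the same copies with vectorized loads; the test below checks it against exactly this kind of per-token gather.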


@@ -99,6 +99,47 @@ def test_reshape_and_cache(
    assert torch.allclose(value_cache, cloned_value_cache)


def test_gather_cached_kv(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
) -> None:
    num_slots = block_size * num_blocks
    # Pick num_tokens distinct cache slots for the tokens to live in.
    slot_mapping = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda')

    qkv = torch.randn(
        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
    _, key, value = qkv.unbind(dim=1)

    qkv_clone = qkv.clone()
    _, cloned_key, cloned_value = qkv_clone.unbind(dim=1)

    # x elements of dtype fill one 16-byte vector.
    x = 16 // torch.tensor([], dtype=dtype).element_size()
    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
    key_cache = torch.randn(size=key_cache_shape, dtype=dtype, device='cuda')

    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
    value_cache = torch.randn(
        size=value_cache_shape, dtype=dtype, device='cuda')

    # Run the kernel under test.
    cache_ops.gather_cached_kv(key, value, key_cache, value_cache, slot_mapping)

    # Reference implementation: gather each token's K/V entry from the caches
    # into the cloned tensors. reshape returns a view here, so writes through
    # reshaped_key update cloned_key.
    reshaped_key = cloned_key.reshape(num_tokens, num_heads, head_size // x, x)
    for i in range(num_tokens):
        block_idx = torch.div(slot_mapping[i], block_size, rounding_mode='floor')
        block_offset = slot_mapping[i] % block_size
        reshaped_key[i] = key_cache[block_idx, :, :, block_offset, :]
        cloned_value[i] = value_cache[block_idx, :, :, block_offset]

    assert torch.allclose(key, cloned_key)
    assert torch.allclose(value, cloned_value)


@torch.inference_mode()
def test_cache() -> None:
    test_copy_blocks(
@@ -107,6 +148,9 @@ def test_cache() -> None:
    test_reshape_and_cache(
        num_tokens=3, num_heads=2, head_size=16, block_size=8, num_blocks=2,
        dtype=torch.half)
    test_gather_cached_kv(
        num_tokens=3, num_heads=2, head_size=16, block_size=8, num_blocks=2,
        dtype=torch.half)


if __name__ == '__main__':
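A note on the x used in the test: it is the number of elements that fit in 16 bytes, presumably so the innermost key-cache dimension lines up with 16-byte vectorized loads. A quick check of the arithmetic for the torch.half configuration test_cache() uses:

import torch

# 16 bytes / 2 bytes per fp16 element => 8 elements per vector.
x = 16 // torch.tensor([], dtype=torch.half).element_size()
assert x == 8
# With head_size=16, each head is stored per slot in the key cache as a
# (head_size // x, x) = (2, 8) tile.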