[Model] H2O Danube3-4b (#6451)
@@ -28,7 +28,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
 # FlashAttention forward only supports head dimension at most 128
 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256
+HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256
               ] if not is_hip() else [64, 80, 96, 112, 128]

 BLOCK_SIZES = [16, 32]
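The new entry 120 is, going by the commit title, the per-head size of H2O Danube3-4b. A minimal sketch of the arithmetic behind that value, with the hidden size and head count assumed for illustration rather than read from the model's config:

# Assumed figures for illustration only; the authoritative numbers live in the
# model's config.json, not in this commit.
HIDDEN_SIZE = 3840
NUM_ATTENTION_HEADS = 32

# The per-head size is the hidden dimension split evenly across the heads.
head_size = HIDDEN_SIZE // NUM_ATTENTION_HEADS
assert head_size == 120  # the value added to HEAD_SIZES in the hunk above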
@@ -134,6 +134,8 @@ def test_paged_attention(
     seed: int,
     device: str,
 ) -> None:
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        pytest.skip()
     random.seed(seed)
     torch.random.manual_seed(seed)
     if torch.cuda.is_available():
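The guard added here (and mirrored in the cache tests below) keeps the enlarged head-size matrix from failing on the fp8 KV-cache path: 120 % 16 == 8, and the fp8 cache kernels apparently require head sizes that are multiples of 16, so those combinations are skipped rather than run. A self-contained sketch of the pattern, not the repository's actual test:

import pytest

# Same head sizes as the hunks in this commit; "auto" vs "fp8" is assumed to
# be how the KV-cache dtype is parametrized in these tests.
HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
KV_CACHE_DTYPES = ["auto", "fp8"]

@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES)
def test_fp8_alignment_skip(head_size: int, kv_cache_dtype: str) -> None:
    # Skip combinations the fp8 cache path is assumed not to support.
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip("fp8 KV cache assumed to need head_size % 16 == 0")
    # The real tests build a KV cache and call the kernel here; this sketch
    # only checks that every surviving combination is 16-aligned or non-fp8.
    assert kv_cache_dtype != "fp8" or head_size % 16 == 0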
@@ -11,7 +11,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [42] # Arbitrary values for testing
 NUM_LAYERS = [1] # Arbitrary values for testing
 NUM_HEADS = [8] # Arbitrary values for testing
-HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256]
+HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
 BLOCK_SIZES = [8, 16, 32]

 # Arbitrary values for testing
@@ -52,6 +52,8 @@ def test_copy_blocks(
     kv_cache_dtype: str,
     device: str,
 ) -> None:
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        pytest.skip()
     random.seed(seed)
     torch.random.manual_seed(seed)
     if torch.cuda.is_available():
@@ -124,6 +126,8 @@ def test_reshape_and_cache(
     device: str,
     kv_cache_dtype: str,
 ) -> None:
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        pytest.skip()
     random.seed(seed)
     torch.random.manual_seed(seed)
     if torch.cuda.is_available():
@@ -325,6 +329,8 @@ def test_swap_blocks(
 ) -> None:
     if kv_cache_dtype == "fp8" and "cpu" in direction:
         pytest.skip()
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        pytest.skip()
     random.seed(seed)
     torch.random.manual_seed(seed)
     if torch.cuda.is_available():
@@ -10,7 +10,7 @@ from .allclose_default import get_default_atol, get_default_rtol

 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256]
+HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
 ROTARY_DIMS = [None, 32] # None means rotary dim == head size
 NUM_HEADS = [7, 17] # Arbitrary values for testing
 BATCH_SIZES = [1, 5] # Arbitrary values for testing
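For context on the last hunk, ROTARY_DIMS = [None, 32] exercises both full and partial rotary embedding: None means the rotation spans the whole head (now possibly 120 channels wide), while 32 rotates only the leading channels. A rough rotate-half sketch of partial rotation, stated as an assumption rather than vLLM's implementation:

from typing import Optional

import torch

def apply_partial_rotary(q: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                         rotary_dim: Optional[int]) -> torch.Tensor:
    """Rotate the first `rotary_dim` channels of each head; pass the rest through."""
    head_size = q.shape[-1]
    rotary_dim = head_size if rotary_dim is None else rotary_dim
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    half = rotary_dim // 2
    q1, q2 = q_rot[..., :half], q_rot[..., half:]
    # Rotate-half formulation; cos/sin carry one angle per rotated pair.
    rotated = torch.cat((q1 * cos - q2 * sin, q2 * cos + q1 * sin), dim=-1)
    return torch.cat((rotated, q_pass), dim=-1)

# Example: one query head of the newly tested size 120 with rotary_dim = 32.
q = torch.randn(1, 1, 120)
angles = torch.arange(16, dtype=torch.float32)  # 32 // 2 rotated pairs
out = apply_partial_rotary(q, angles.cos(), angles.sin(), rotary_dim=32)
assert out.shape == q.shape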