[Kernel] Layernorm performance optimization (#3662)

2024-03-30 14:26:38 -07:00
parent 51c31bc10c
commit b6d103542c
4 changed files with 285 additions and 47 deletions
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -5,7 +5,8 @@ from vllm.model_executor.layers.layernorm import RMSNorm

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
-HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
+HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
+                8199]  # Test values, incl. non-multiples of 2/4/8
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
 CUDA_DEVICES = [