[Kernel] Layernorm performance optimization (#3662)

This commit is contained in:
mawong-amd
2024-03-30 14:26:38 -07:00
committed by GitHub
parent 51c31bc10c
commit b6d103542c
4 changed files with 285 additions and 47 deletions

View File

@@ -5,7 +5,8 @@ from vllm.model_executor.layers.layernorm import RMSNorm
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
-HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing
+HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192,
+8199] # Arbitrary values for testing
ADD_RESIDUAL = [False, True]
SEEDS = [0]
CUDA_DEVICES = [