[FP8][Kernel] Dynamic kv cache scaling factors computation (#11906)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Micah Williamson <micah.williamson@amd.com>
This commit is contained in:
committed by
GitHub
parent
6e650f56a1
commit
e97f802b2d
@@ -1,90 +0,0 @@
|
||||
{
|
||||
"model_type": "llama",
|
||||
"kv_cache": {
|
||||
"dtype": "float8_e4m3fn",
|
||||
"scaling_factor": {
|
||||
"0": {
|
||||
"0": 0.0230364128947258,
|
||||
"1": 0.01979283057153225,
|
||||
"2": 0.0241350457072258,
|
||||
"3": 0.0308314748108387,
|
||||
"4": 0.0430733822286129,
|
||||
"5": 0.0370396226644516,
|
||||
"6": 0.0306222103536129,
|
||||
"7": 0.0357491634786129,
|
||||
"8": 0.0358189195394516,
|
||||
"9": 0.0443289652466774,
|
||||
"10": 0.0433175228536129,
|
||||
"11": 0.0416782945394516,
|
||||
"12": 0.0366908498108387,
|
||||
"13": 0.0432477705180645,
|
||||
"14": 0.0410505048930645,
|
||||
"15": 0.0457589291036129,
|
||||
"16": 0.0418526791036129,
|
||||
"17": 0.0432477705180645,
|
||||
"18": 0.0469447560608387,
|
||||
"19": 0.0514787957072258,
|
||||
"20": 0.0541294664144516,
|
||||
"21": 0.0587681382894516,
|
||||
"22": 0.0625,
|
||||
"23": 0.0585588738322258,
|
||||
"24": 0.0600237175822258,
|
||||
"25": 0.0588030144572258,
|
||||
"26": 0.0531180277466774,
|
||||
"27": 0.06396484375,
|
||||
"28": 0.0603027381002903,
|
||||
"29": 0.0582101047039032,
|
||||
"30": 0.0625348836183548,
|
||||
"31": 0.0585588738322258,
|
||||
"32": 0.0582798570394516,
|
||||
"33": 0.0575125589966774,
|
||||
"34": 0.0590820349752903,
|
||||
"35": 0.0614188089966774,
|
||||
"36": 0.0631975457072258,
|
||||
"37": 0.0615931935608387,
|
||||
"38": 0.0601283498108387,
|
||||
"39": 0.0571986623108387,
|
||||
"40": 0.0670340433716774,
|
||||
"41": 0.0523507259786129,
|
||||
"42": 0.0547223798930645,
|
||||
"43": 0.0631975457072258,
|
||||
"44": 0.0663713738322258,
|
||||
"45": 0.0603376142680645,
|
||||
"46": 0.0652204304933548,
|
||||
"47": 0.0734514519572258,
|
||||
"48": 0.0693708211183548,
|
||||
"49": 0.0725446492433548,
|
||||
"50": 0.0627790242433548,
|
||||
"51": 0.0691266804933548,
|
||||
"52": 0.0688825398683548,
|
||||
"53": 0.068429134786129,
|
||||
"54": 0.0605119988322258,
|
||||
"55": 0.0799386203289032,
|
||||
"56": 0.0853097140789032,
|
||||
"57": 0.0661969929933548,
|
||||
"58": 0.0689871683716774,
|
||||
"59": 0.0724051371216774,
|
||||
"60": 0.0541643425822258,
|
||||
"61": 0.0626743882894516,
|
||||
"62": 0.0628487765789032,
|
||||
"63": 0.0607212632894516,
|
||||
"64": 0.0589076466858387,
|
||||
"65": 0.0451660193502903,
|
||||
"66": 0.0453055277466774,
|
||||
"67": 0.0414341539144516,
|
||||
"68": 0.0385044664144516,
|
||||
"69": 0.0414341539144516,
|
||||
"70": 0.0466308631002903,
|
||||
"71": 0.0399693101644516,
|
||||
"72": 0.0437011756002903,
|
||||
"73": 0.0434221550822258,
|
||||
"74": 0.0428989976644516,
|
||||
"75": 0.0401785746216774,
|
||||
"76": 0.0431082621216774,
|
||||
"77": 0.0484444759786129,
|
||||
"78": 0.0417829267680645,
|
||||
"79": 0.0418178029358387
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
{
|
||||
"model_type": "llama",
|
||||
"kv_cache": {
|
||||
"dtype": "float8_e4m3fn",
|
||||
"scaling_factor": {
|
||||
"0": {
|
||||
"0": 0.0152239128947258,
|
||||
"1": 0.0188860222697258,
|
||||
"2": 0.0354178324341774,
|
||||
"3": 0.0376674123108387,
|
||||
"4": 0.0418526791036129,
|
||||
"5": 0.0433175228536129,
|
||||
"6": 0.0397600457072258,
|
||||
"7": 0.0424455925822258,
|
||||
"8": 0.0415387861430645,
|
||||
"9": 0.0408412404358387,
|
||||
"10": 0.0395856611430645,
|
||||
"11": 0.0377371683716774,
|
||||
"12": 0.0400739423930645,
|
||||
"13": 0.040771484375,
|
||||
"14": 0.0393415205180645,
|
||||
"15": 0.0369001142680645,
|
||||
"16": 0.03857421875,
|
||||
"17": 0.0387486070394516,
|
||||
"18": 0.0403180830180645,
|
||||
"19": 0.0396205373108387,
|
||||
"20": 0.0375627800822258,
|
||||
"21": 0.0407366082072258,
|
||||
"22": 0.0432477705180645,
|
||||
"23": 0.0377022884786129,
|
||||
"24": 0.0399693101644516,
|
||||
"25": 0.0374581478536129,
|
||||
"26": 0.0413295216858387,
|
||||
"27": 0.0442243330180645,
|
||||
"28": 0.0424804724752903,
|
||||
"29": 0.0456891767680645,
|
||||
"30": 0.0409109964966774,
|
||||
"31": 0.0482352152466774
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -182,7 +182,7 @@ def test_paged_attention(
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
|
||||
# Call the paged attention kernel.
|
||||
output = torch.empty_like(query)
|
||||
|
||||
@@ -210,7 +210,7 @@ def test_paged_attention(
|
||||
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
tp_rank = 0
|
||||
|
||||
# Call the paged attention kernel.
|
||||
|
||||
@@ -160,7 +160,7 @@ def test_reshape_and_cache(
|
||||
cloned_value_cache = value_cache.clone()
|
||||
|
||||
# Using default kv_scale
|
||||
k_scale = v_scale = 1.0
|
||||
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
|
||||
# Call the reshape_and_cache kernel.
|
||||
opcheck(torch.ops._C_cache_ops.reshape_and_cache,
|
||||
@@ -258,8 +258,8 @@ def test_reshape_and_cache_flash(
|
||||
del key_caches
|
||||
del value_caches
|
||||
|
||||
k_scale = key.amax().item() / 256
|
||||
v_scale = value.amax().item() / 256
|
||||
k_scale = (key.amax() / 256.0).to(torch.float32)
|
||||
v_scale = (value.amax() / 256.0).to(torch.float32)
|
||||
|
||||
# Clone the KV caches.
|
||||
if kv_cache_dtype == "fp8":
|
||||
@@ -284,12 +284,12 @@ def test_reshape_and_cache_flash(
|
||||
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_key_cache,
|
||||
key_cache,
|
||||
k_scale,
|
||||
k_scale.item(),
|
||||
kv_dtype=kv_cache_dtype)
|
||||
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
|
||||
ops.convert_fp8(result_value_cache,
|
||||
value_cache,
|
||||
v_scale,
|
||||
v_scale.item(),
|
||||
kv_dtype=kv_cache_dtype)
|
||||
|
||||
# Run the reference implementation.
|
||||
|
||||
@@ -138,6 +138,7 @@ def test_contexted_kv_attention(
|
||||
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
|
||||
v_cache = v_cache.view(-1, block_size, num_kv_heads,
|
||||
head_size).permute(0, 2, 3, 1).contiguous()
|
||||
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
|
||||
# Warm up the Triton kernel by calling it once before actually measuring
|
||||
# generation time
|
||||
@@ -153,6 +154,8 @@ def test_contexted_kv_attention(
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
@@ -168,6 +171,8 @@ def test_contexted_kv_attention(
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
sliding_window=sliding_window)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
@@ -366,6 +371,7 @@ def test_contexted_kv_attention_alibi(
|
||||
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
|
||||
v_cache = v_cache.view(-1, block_size, num_kv_heads,
|
||||
head_size).permute(0, 2, 3, 1).contiguous()
|
||||
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
|
||||
|
||||
# Warm up the Triton kernel by calling it once before actually measuring
|
||||
# generation time
|
||||
@@ -381,6 +387,8 @@ def test_contexted_kv_attention_alibi(
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
alibi_slopes=alibi_slopes)
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
@@ -396,6 +404,8 @@ def test_contexted_kv_attention_alibi(
|
||||
b_seq_len,
|
||||
b_ctx_len,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
alibi_slopes=alibi_slopes)
|
||||
torch.cuda.synchronize()
|
||||
end_time = time.time()
|
||||
|
||||
@@ -909,6 +909,7 @@ def make_test_metadata(
|
||||
num_prefills=num_prefills,
|
||||
slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
num_decode_tokens=num_decode_tokens,
|
||||
seq_lens=seq_lens,
|
||||
@@ -958,6 +959,7 @@ def make_test_metadata(
|
||||
num_prefills=num_prefills,
|
||||
slot_mapping=kv_mmap.slot_mapping,
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
num_decode_tokens=num_decode_tokens,
|
||||
seq_lens=seq_lens,
|
||||
|
||||
@@ -19,18 +19,17 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,base_model,test_model,scale_path",
|
||||
"kv_cache_dtype,base_model,test_model",
|
||||
[
|
||||
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
|
||||
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
|
||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
|
||||
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
|
||||
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct", None),
|
||||
"meta-llama/Llama-3.2-1B-Instruct"),
|
||||
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
|
||||
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
|
||||
"meta-llama/Llama-2-7b-chat-hf",
|
||||
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
|
||||
"meta-llama/Llama-2-7b-chat-hf")
|
||||
])
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@@ -48,7 +47,6 @@ def test_models(
|
||||
kv_cache_dtype: str,
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
scale_path: Optional[str],
|
||||
max_tokens: int,
|
||||
enforce_eager: bool,
|
||||
backend: str,
|
||||
@@ -76,10 +74,6 @@ def test_models(
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
|
||||
extra_kwargs = {}
|
||||
if scale_path is not None:
|
||||
extra_kwargs["quantization_param_path"] = scale_path
|
||||
|
||||
with vllm_runner(
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
@@ -87,7 +81,6 @@ def test_models(
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
**extra_kwargs,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
|
||||
@@ -74,6 +74,7 @@ def test_model_runner_input():
|
||||
num_decode_tokens=3,
|
||||
slot_mapping=torch.zeros(1),
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
)
|
||||
model_input = ModelInputForGPUWithSamplingMetadata(
|
||||
input_tokens=torch.ones(10),
|
||||
@@ -126,6 +127,7 @@ def test_embedding_model_runner_input():
|
||||
num_decode_tokens=3,
|
||||
slot_mapping=torch.zeros(1),
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
)
|
||||
model_input = ModelInputForGPUWithPoolingMetadata(
|
||||
input_tokens=torch.ones(10),
|
||||
@@ -177,6 +179,7 @@ def test_multi_step_model_runner_input():
|
||||
num_decode_tokens=3,
|
||||
slot_mapping=torch.zeros(1),
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
)
|
||||
frozen_model_input = ModelInputForGPUWithSamplingMetadata(
|
||||
input_tokens=torch.ones(10),
|
||||
|
||||
Reference in New Issue
Block a user