Fix trtllm-gen attention env and add attention sink (#22378)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
This commit is contained in:
Lain
2025-08-06 18:07:41 -07:00
committed by GitHub
parent 5c7cc33f4d
commit 9a3835aaa9
5 changed files with 21 additions and 28 deletions

View File

@@ -254,8 +254,7 @@ def get_kv_cache_layout():
# Override with format specified by the user.
cache_layout = envs.VLLM_KV_CACHE_LAYOUT
if cache_layout is None:
-        if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION
-                or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION):
+        if envs.VLLM_USE_TRTLLM_ATTENTION:
cache_layout = "HND"
else:
cache_layout = get_kv_connector_cache_layout()
@@ -333,8 +332,7 @@ def infer_global_hyperparameters(
global_params = param_sets[0]
# trtllm attention doesn't need global hyper params so disable the check
-    if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION
-            and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION):
+    if not envs.VLLM_USE_TRTLLM_ATTENTION:
for params in param_sets:
if params.window_left != global_params.window_left:
raise ValueError(