Fix trtllm-gen attention env and add attention sink (#22378)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
@@ -254,8 +254,7 @@ def get_kv_cache_layout():
     # Override with format specified by the user.
     cache_layout = envs.VLLM_KV_CACHE_LAYOUT
     if cache_layout is None:
-        if (envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION
-                or envs.VLLM_USE_TRTLLM_DECODE_ATTENTION):
+        if envs.VLLM_USE_TRTLLM_ATTENTION:
             cache_layout = "HND"
         else:
             cache_layout = get_kv_connector_cache_layout()
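For reference, a minimal standalone sketch of the layout selection after this change. The _env_flag helper and the "NHD" fallback are assumptions standing in for vLLM's envs module and get_kv_connector_cache_layout(); this is not vLLM's actual implementation.

import os

def _env_flag(name: str) -> bool:
    # Hypothetical stand-in for vLLM's envs module; treats "1"/"true" as set.
    return os.environ.get(name, "").lower() in ("1", "true")

def pick_kv_cache_layout() -> str:
    # A user-specified layout always takes precedence.
    layout = os.environ.get("VLLM_KV_CACHE_LAYOUT")
    if layout is None:
        if _env_flag("VLLM_USE_TRTLLM_ATTENTION"):
            # The trtllm-gen attention kernels expect the HND layout.
            layout = "HND"
        else:
            # Stubbed fallback; vLLM defers to the KV connector here.
            layout = "NHD"
    return layout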
@@ -333,8 +332,7 @@ def infer_global_hyperparameters(
     global_params = param_sets[0]
 
     # trtllm attention doesn't need global hyper params so disable the check
-    if (not envs.VLLM_USE_TRTLLM_CONTEXT_ATTENTION
-            and not envs.VLLM_USE_TRTLLM_DECODE_ATTENTION):
+    if not envs.VLLM_USE_TRTLLM_ATTENTION:
         for params in param_sets:
             if params.window_left != global_params.window_left:
                 raise ValueError(
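For context, a simplified sketch of the consistency check this hunk gates. The PerLayerParameters stand-in, the extra use_trtllm_attention parameter, and the error message are assumptions; vLLM's real dataclass and function signature carry more fields.

from dataclasses import dataclass
from typing import List

@dataclass
class PerLayerParameters:
    # Simplified stand-in; the real vLLM class has more fields.
    window_left: int

def infer_global_hyperparameters(
        param_sets: List[PerLayerParameters],
        use_trtllm_attention: bool) -> PerLayerParameters:
    global_params = param_sets[0]
    # Per the commit, trtllm attention doesn't need global hyperparameters,
    # so the cross-layer consistency check is skipped when it is enabled.
    if not use_trtllm_attention:
        for params in param_sets:
            if params.window_left != global_params.window_left:
                raise ValueError(
                    "All attention layers must share the same window_left")
    return global_params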