[Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -3021,16 +3021,20 @@ def _get_and_verify_max_len(
|
|||||||
f"User-specified max_model_len ({max_model_len}) is greater "
|
f"User-specified max_model_len ({max_model_len}) is greater "
|
||||||
f"than the derived max_model_len ({max_len_key}="
|
f"than the derived max_model_len ({max_len_key}="
|
||||||
f"{derived_max_model_len} or model_max_length="
|
f"{derived_max_model_len} or model_max_length="
|
||||||
f"{model_max_length} in model's config.json). This may lead "
|
f"{model_max_length} in model's config.json).")
|
||||||
"to incorrect model outputs or CUDA errors.")
|
warning = (
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
|
||||||
|
"caution. If the model uses relative position encoding (RoPE), "
|
||||||
|
"positions exceeding derived_max_model_len lead to nan. If the "
|
||||||
|
"model uses absolute position encoding, positions exceeding "
|
||||||
|
"derived_max_model_len will cause a CUDA array out-of-bounds "
|
||||||
|
"error.")
|
||||||
if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
|
if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
|
||||||
logger.warning(
|
logger.warning_once("%s %s", msg, warning)
|
||||||
"%s Make sure the value is correct and within the "
|
|
||||||
"model context size.", msg)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"{msg} To allow overriding this maximum, set "
|
f"{msg} To allow overriding this maximum, set "
|
||||||
"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
|
f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
|
||||||
return int(max_model_len)
|
return int(max_model_len)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user