[Frontend] Warn if user max_model_len is greater than derived max_model_len (#7080)

Signed-off-by: Jefferson Fialho <jfialho@ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
This commit is contained in:
Jeff Fialho
2024-08-03 20:01:38 -03:00
committed by GitHub
parent 44dcb52e39
commit 825b044863
2 changed files with 23 additions and 6 deletions

View File

@@ -50,6 +50,7 @@ if TYPE_CHECKING:
VLLM_NO_DEPRECATION_WARNING: bool = False
CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
def get_default_cache_root():
@@ -331,6 +332,15 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# If set, vllm will skip the deprecation warnings.
"VLLM_NO_DEPRECATION_WARNING":
lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
# If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set to "1" or "true"
# (case-insensitive), the user may specify a max sequence length
# greater than the max length derived from the model's config.json.
# To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
"VLLM_ALLOW_LONG_MAX_MODEL_LEN":
lambda:
(os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
("1", "true")),
}
# end-env-vars-definition