[Frontend][TPU] Add TPU default max-num-batched-tokens based on device name (#17508)

Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-05-02 21:42:44 -07:00
parent e3d0a1d190
commit 87baebebd8
3 changed files with 40 additions and 4 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3140,6 +3140,14 @@ def _get_and_verify_max_len(
    # derived length from the HF model config.
    if max_model_len is None:
        max_model_len = int(derived_max_model_len)
+        if current_platform.is_tpu():
+            logger.warning(
+                "--max-model-len is not specified, "
+                "it's currently using model's default length %s, "
+                "which might be too large. "
+                "Please input with --max-model-len based on your "
+                "request input length and output length, to avoid "
+                "unnecessary degradation.", max_model_len)
    elif max_model_len > derived_max_model_len:
        # Some models might have a separate key for specifying model_max_length
        # that will be bigger than derived_max_model_len. We compare user input