[Hardware][TPU][V1] Multi-LoRA implementation for the V1 TPU backend (#14238)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-05-07 21:28:47 +01:00
parent db593aa67f
commit c20ef40fd0
19 changed files with 929 additions and 46 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2694,8 +2694,8 @@ class LoRAConfig:
    lora_extra_vocab_size: int = 256
    """Maximum size of extra vocabulary that can be present in a LoRA adapter
    (added to the base model vocabulary)."""
-    # This is a constant.
-    lora_vocab_padding_size: ClassVar[int] = 256
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
    long_lora_scaling_factors: Optional[tuple[float, ...]] = None
    """Specify multiple scaling factors (which can be different from base model
    scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters
@@ -2723,6 +2723,7 @@ class LoRAConfig:
        factors.append(self.fully_sharded_loras)
        factors.append(self.lora_dtype)
        factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
        factors.append(self.long_lora_scaling_factors)
        factors.append(self.bias_enabled)
        hash_str = hashlib.md5(str(factors).encode(),