[Hardware][TPU][V1] Multi-LoRA implementation for the V1 TPU backend (#14238)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
This commit is contained in:
Akshat Tripathi
2025-05-07 21:28:47 +01:00
committed by GitHub
parent db593aa67f
commit c20ef40fd0
19 changed files with 929 additions and 46 deletions

View File

@@ -2694,8 +2694,8 @@ class LoRAConfig:
lora_extra_vocab_size: int = 256
"""Maximum size of extra vocabulary that can be present in a LoRA adapter
(added to the base model vocabulary)."""
# This is a constant.
lora_vocab_padding_size: ClassVar[int] = 256
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
long_lora_scaling_factors: Optional[tuple[float, ...]] = None
"""Specify multiple scaling factors (which can be different from base model
scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters
@@ -2723,6 +2723,7 @@ class LoRAConfig:
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.lora_vocab_padding_size)
factors.append(self.long_lora_scaling_factors)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode(),