[Core] Support load and unload LoRA in api server (#6566)
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -61,6 +61,7 @@ if TYPE_CHECKING:
|
||||
VLLM_ALLOW_ENGINE_USE_RAY: bool = False
|
||||
VLLM_PLUGINS: Optional[List[str]] = None
|
||||
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
|
||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -409,6 +410,12 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
# If set, vLLM will use Triton implementations of AWQ.
|
||||
"VLLM_USE_TRITON_AWQ":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
|
||||
|
||||
# If set to "1" or "true" (case-insensitive), allow loading or unloading LoRA adapters at runtime.
|
||||
"VLLM_ALLOW_RUNTIME_LORA_UPDATING":
|
||||
lambda:
|
||||
(os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in
|
||||
("1", "true")),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
Reference in New Issue
Block a user