[Bugfix] Update Run:AI Model Streamer Loading Integration (#23845)

Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
Signed-off-by: Peter Schuurman <psch@google.com>
Co-authored-by: Omer Dayan (SW-GPU) <omer@run.ai>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
pwschuurman
2025-09-09 21:37:17 -07:00
committed by GitHub
parent 009d689b0c
commit 4377b1ae3b
7 changed files with 187 additions and 122 deletions

View File

@@ -48,8 +48,9 @@ from vllm.transformers_utils.config import (
is_interleaved, maybe_override_with_speculators_target_model,
try_get_generation_config, try_get_safetensors_metadata,
try_get_tokenizer_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
is_runai_obj_uri)
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType,
LazyLoader, common_broadcastable_dtype, random_uuid)
@@ -556,15 +557,6 @@ class ModelConfig:
"affect the random state of the Python process that "
"launched vLLM.", self.seed)
if self.runner != "draft":
# If we're not running the draft model, check for speculators config
# If speculators config, set model / tokenizer to be target model
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
model=self.model,
tokenizer=self.tokenizer,
revision=self.revision,
trust_remote_code=self.trust_remote_code)
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
@@ -603,7 +595,16 @@ class ModelConfig:
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(self.model, self.tokenizer)
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
if self.runner != "draft":
# If we're not running the draft model, check for speculators config
# If speculators config, set model / tokenizer to be target model
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
model=self.model,
tokenizer=self.tokenizer,
revision=self.revision,
trust_remote_code=self.trust_remote_code)
if (backend := envs.VLLM_ATTENTION_BACKEND
) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
@@ -832,41 +833,42 @@ class ModelConfig:
"""The architecture vllm actually used."""
return self._architecture
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed.
def maybe_pull_model_tokenizer_for_runai(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from Object Storage to temporary
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_s3(model) or is_s3(tokenizer)):
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
return
if is_s3(model):
s3_model = S3Model()
s3_model.pull_files(model,
allow_pattern=["*.model", "*.py", "*.json"])
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel()
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"])
self.model_weights = model
self.model = s3_model.dir
self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
s3_model.pull_files(model,
ignore_pattern=[
"*.pt", "*.safetensors", "*.bin",
"*.tensors"
])
self.tokenizer = s3_model.dir
object_storage_model.pull_files(model,
ignore_pattern=[
"*.pt", "*.safetensors",
"*.bin", "*.tensors"
])
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_s3(tokenizer):
s3_tokenizer = S3Model()
s3_tokenizer.pull_files(
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel()
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
self.tokenizer = s3_tokenizer.dir
self.tokenizer = object_storage_tokenizer.dir
def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
if self._model_info.supports_multimodal: