[Feature] Add Azure Blob Storage support for RunAI Model Streamer (#34614)
Signed-off-by: hasethuraman <hsethuraman@microsoft.com>
This commit is contained in:
@@ -620,7 +620,7 @@ RUN set -eux; \
|
|||||||
ARG BITSANDBYTES_VERSION_X86=0.46.1
|
ARG BITSANDBYTES_VERSION_X86=0.46.1
|
||||||
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
|
ARG BITSANDBYTES_VERSION_ARM64=0.42.0
|
||||||
ARG TIMM_VERSION=">=1.0.17"
|
ARG TIMM_VERSION=">=1.0.17"
|
||||||
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.3"
|
ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.7"
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
|
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \
|
||||||
@@ -628,7 +628,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
|
||||||
fi; \
|
fi; \
|
||||||
uv pip install --system accelerate hf_transfer modelscope \
|
uv pip install --system accelerate hf_transfer modelscope \
|
||||||
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs]${RUNAI_MODEL_STREAMER_VERSION}"
|
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# VLLM INSTALLATION (depends on build stage)
|
# VLLM INSTALLATION (depends on build stage)
|
||||||
|
|||||||
@@ -83,7 +83,7 @@
|
|||||||
"default": ">=1.0.17"
|
"default": ">=1.0.17"
|
||||||
},
|
},
|
||||||
"RUNAI_MODEL_STREAMER_VERSION": {
|
"RUNAI_MODEL_STREAMER_VERSION": {
|
||||||
"default": ">=0.15.3"
|
"default": ">=0.15.7"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,16 @@ vllm serve gs://core-llm/Llama-3-8b \
|
|||||||
--load-format runai_streamer
|
--load-format runai_streamer
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To run a model from Azure Blob Storage, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
AZURE_STORAGE_ACCOUNT_NAME=<account> \
|
||||||
|
vllm serve az://<container>/<model-path> \
|
||||||
|
--load-format runai_streamer
|
||||||
|
```
|
||||||
|
|
||||||
|
Authentication uses `DefaultAzureCredential`, which supports `az login`, managed identity, environment variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET`), and other methods.
|
||||||
|
|
||||||
To run a model from an S3-compatible object store, run:
|
To run a model from an S3-compatible object store, run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ tritonclient>=2.51.0
|
|||||||
|
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
numpy
|
numpy
|
||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs,azure]==0.15.7
|
||||||
fastsafetensors>=0.2.2
|
fastsafetensors>=0.2.2
|
||||||
instanttensor>=0.1.5
|
instanttensor>=0.1.5
|
||||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ tensorizer==2.10.1
|
|||||||
packaging>=24.2
|
packaging>=24.2
|
||||||
setuptools>=77.0.3,<80.0.0
|
setuptools>=77.0.3,<80.0.0
|
||||||
setuptools-scm>=8
|
setuptools-scm>=8
|
||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs,azure]==0.15.7
|
||||||
conch-triton-kernels==1.2.1
|
conch-triton-kernels==1.2.1
|
||||||
timm>=1.0.17
|
timm>=1.0.17
|
||||||
# amd-quark: required for Quark quantization on ROCm
|
# amd-quark: required for Quark quantization on ROCm
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ grpcio-reflection==1.78.0
|
|||||||
arctic-inference == 0.1.1 # Required for suffix decoding test
|
arctic-inference == 0.1.1 # Required for suffix decoding test
|
||||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||||
numpy
|
numpy
|
||||||
runai-model-streamer[s3,gcs]==0.15.3
|
runai-model-streamer[s3,gcs,azure]==0.15.7
|
||||||
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
|
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
|
||||||
instanttensor>=0.1.5
|
instanttensor>=0.1.5
|
||||||
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||||
|
|||||||
@@ -64,6 +64,14 @@ audioread==3.0.1
|
|||||||
# via librosa
|
# via librosa
|
||||||
av==16.1.0
|
av==16.1.0
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
|
azure-core==1.38.2
|
||||||
|
# via
|
||||||
|
# azure-identity
|
||||||
|
# azure-storage-blob
|
||||||
|
azure-identity==1.25.2
|
||||||
|
# via runai-model-streamer-azure
|
||||||
|
azure-storage-blob==12.28.0
|
||||||
|
# via runai-model-streamer-azure
|
||||||
backoff==2.2.1
|
backoff==2.2.1
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
@@ -103,8 +111,10 @@ certifi==2024.8.30
|
|||||||
# rasterio
|
# rasterio
|
||||||
# requests
|
# requests
|
||||||
# sentry-sdk
|
# sentry-sdk
|
||||||
cffi==1.17.1
|
cffi==2.0.0
|
||||||
# via soundfile
|
# via
|
||||||
|
# cryptography
|
||||||
|
# soundfile
|
||||||
chardet==5.2.0
|
chardet==5.2.0
|
||||||
# via mbstrdecoder
|
# via mbstrdecoder
|
||||||
charset-normalizer==3.4.0
|
charset-normalizer==3.4.0
|
||||||
@@ -148,6 +158,12 @@ coverage==7.10.6
|
|||||||
# via pytest-cov
|
# via pytest-cov
|
||||||
cramjam==2.9.0
|
cramjam==2.9.0
|
||||||
# via fastparquet
|
# via fastparquet
|
||||||
|
cryptography==46.0.5
|
||||||
|
# via
|
||||||
|
# azure-identity
|
||||||
|
# azure-storage-blob
|
||||||
|
# msal
|
||||||
|
# pyjwt
|
||||||
cuda-bindings==12.9.4
|
cuda-bindings==12.9.4
|
||||||
# via torch
|
# via torch
|
||||||
cuda-pathfinder==1.3.3
|
cuda-pathfinder==1.3.3
|
||||||
@@ -379,6 +395,8 @@ iniconfig==2.0.0
|
|||||||
# via pytest
|
# via pytest
|
||||||
instanttensor==0.1.5
|
instanttensor==0.1.5
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
|
isodate==0.7.2
|
||||||
|
# via azure-storage-blob
|
||||||
isoduration==20.11.0
|
isoduration==20.11.0
|
||||||
# via jsonschema
|
# via jsonschema
|
||||||
isort==5.13.2
|
isort==5.13.2
|
||||||
@@ -492,6 +510,12 @@ more-itertools==10.5.0
|
|||||||
# via lm-eval
|
# via lm-eval
|
||||||
mpmath==1.3.0
|
mpmath==1.3.0
|
||||||
# via sympy
|
# via sympy
|
||||||
|
msal==1.34.0
|
||||||
|
# via
|
||||||
|
# azure-identity
|
||||||
|
# msal-extensions
|
||||||
|
msal-extensions==1.3.1
|
||||||
|
# via azure-identity
|
||||||
msgpack==1.1.0
|
msgpack==1.1.0
|
||||||
# via
|
# via
|
||||||
# librosa
|
# librosa
|
||||||
@@ -828,6 +852,8 @@ pydantic-extra-types==2.10.5
|
|||||||
# via mistral-common
|
# via mistral-common
|
||||||
pygments==2.18.0
|
pygments==2.18.0
|
||||||
# via rich
|
# via rich
|
||||||
|
pyjwt==2.11.0
|
||||||
|
# via msal
|
||||||
pyogrio==0.11.0
|
pyogrio==0.11.0
|
||||||
# via geopandas
|
# via geopandas
|
||||||
pyparsing==3.2.0
|
pyparsing==3.2.0
|
||||||
@@ -945,6 +971,7 @@ regex==2024.9.11
|
|||||||
# transformers
|
# transformers
|
||||||
requests==2.32.3
|
requests==2.32.3
|
||||||
# via
|
# via
|
||||||
|
# azure-core
|
||||||
# buildkite-test-collector
|
# buildkite-test-collector
|
||||||
# datasets
|
# datasets
|
||||||
# diffusers
|
# diffusers
|
||||||
@@ -957,6 +984,7 @@ requests==2.32.3
|
|||||||
# lightly
|
# lightly
|
||||||
# lm-eval
|
# lm-eval
|
||||||
# mistral-common
|
# mistral-common
|
||||||
|
# msal
|
||||||
# mteb
|
# mteb
|
||||||
# pooch
|
# pooch
|
||||||
# ray
|
# ray
|
||||||
@@ -993,11 +1021,13 @@ rsa==4.9.1
|
|||||||
# via google-auth
|
# via google-auth
|
||||||
rtree==1.4.0
|
rtree==1.4.0
|
||||||
# via torchgeo
|
# via torchgeo
|
||||||
runai-model-streamer==0.15.3
|
runai-model-streamer==0.15.7
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
runai-model-streamer-gcs==0.15.3
|
runai-model-streamer-azure==0.15.7
|
||||||
# via runai-model-streamer
|
# via runai-model-streamer
|
||||||
runai-model-streamer-s3==0.15.3
|
runai-model-streamer-gcs==0.15.7
|
||||||
|
# via runai-model-streamer
|
||||||
|
runai-model-streamer-s3==0.15.7
|
||||||
# via runai-model-streamer
|
# via runai-model-streamer
|
||||||
s3transfer==0.10.3
|
s3transfer==0.10.3
|
||||||
# via boto3
|
# via boto3
|
||||||
@@ -1266,6 +1296,9 @@ typing-extensions==4.15.0
|
|||||||
# aiosignal
|
# aiosignal
|
||||||
# albumentations
|
# albumentations
|
||||||
# alembic
|
# alembic
|
||||||
|
# azure-core
|
||||||
|
# azure-identity
|
||||||
|
# azure-storage-blob
|
||||||
# chz
|
# chz
|
||||||
# fastapi
|
# fastapi
|
||||||
# grpcio
|
# grpcio
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -970,7 +970,7 @@ setup(
|
|||||||
"tensorizer": ["tensorizer==2.10.1"],
|
"tensorizer": ["tensorizer==2.10.1"],
|
||||||
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
|
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
|
||||||
"instanttensor": ["instanttensor >= 0.1.5"],
|
"instanttensor": ["instanttensor >= 0.1.5"],
|
||||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
|
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
|
||||||
"audio": [
|
"audio": [
|
||||||
"librosa",
|
"librosa",
|
||||||
"scipy",
|
"scipy",
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ from vllm.transformers_utils.runai_utils import (
|
|||||||
def test_is_runai_obj_uri():
|
def test_is_runai_obj_uri():
|
||||||
assert is_runai_obj_uri("gs://some-gcs-bucket/path")
|
assert is_runai_obj_uri("gs://some-gcs-bucket/path")
|
||||||
assert is_runai_obj_uri("s3://some-s3-bucket/path")
|
assert is_runai_obj_uri("s3://some-s3-bucket/path")
|
||||||
|
assert is_runai_obj_uri("az://some-azure-container/path")
|
||||||
assert not is_runai_obj_uri("nfs://some-nfs-path")
|
assert not is_runai_obj_uri("nfs://some-nfs-path")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from vllm.transformers_utils.gguf_utils import (
|
|||||||
split_remote_gguf,
|
split_remote_gguf,
|
||||||
)
|
)
|
||||||
from vllm.transformers_utils.utils import (
|
from vllm.transformers_utils.utils import (
|
||||||
|
is_azure,
|
||||||
is_cloud_storage,
|
is_cloud_storage,
|
||||||
is_gcs,
|
is_gcs,
|
||||||
is_s3,
|
is_s3,
|
||||||
@@ -31,9 +32,17 @@ def test_is_s3():
|
|||||||
assert not is_s3("nfs://nfs-fqdn.local")
|
assert not is_s3("nfs://nfs-fqdn.local")
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_azure():
|
||||||
|
assert is_azure("az://model-container/path")
|
||||||
|
assert not is_azure("s3://model-path/path-to-model")
|
||||||
|
assert not is_azure("/unix/local/path")
|
||||||
|
assert not is_azure("nfs://nfs-fqdn.local")
|
||||||
|
|
||||||
|
|
||||||
def test_is_cloud_storage():
|
def test_is_cloud_storage():
|
||||||
assert is_cloud_storage("gs://model-path")
|
assert is_cloud_storage("gs://model-path")
|
||||||
assert is_cloud_storage("s3://model-path/path-to-model")
|
assert is_cloud_storage("s3://model-path/path-to-model")
|
||||||
|
assert is_cloud_storage("az://model-container/path")
|
||||||
assert not is_cloud_storage("/unix/local/path")
|
assert not is_cloud_storage("/unix/local/path")
|
||||||
assert not is_cloud_storage("nfs://nfs-fqdn.local")
|
assert not is_cloud_storage("nfs://nfs-fqdn.local")
|
||||||
|
|
||||||
|
|||||||
@@ -1574,8 +1574,9 @@ class VllmConfig:
|
|||||||
"runai_streamer_sharded",
|
"runai_streamer_sharded",
|
||||||
):
|
):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"To load a model from S3, 'load_format' "
|
f"To load a model from object storage (S3/GCS/Azure), "
|
||||||
f"must be 'runai_streamer' or 'runai_streamer_sharded', "
|
f"'load_format' must be 'runai_streamer' or "
|
||||||
|
f"'runai_streamer_sharded', "
|
||||||
f"but got '{self.load_config.load_format}'. "
|
f"but got '{self.load_config.load_format}'. "
|
||||||
f"Model: {self.model_config.model}"
|
f"Model: {self.model_config.model}"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri, list_safetenso
|
|||||||
class RunaiModelStreamerLoader(BaseModelLoader):
|
class RunaiModelStreamerLoader(BaseModelLoader):
|
||||||
"""
|
"""
|
||||||
Model loader that can load safetensors
|
Model loader that can load safetensors
|
||||||
files from local FS or S3 bucket.
|
files from local FS, S3, GCS, or Azure Blob Storage.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, load_config: LoadConfig):
|
def __init__(self, load_config: LoadConfig):
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from vllm.utils.import_utils import PlaceholderModule
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
SUPPORTED_SCHEMES = ["s3://", "gs://"]
|
SUPPORTED_SCHEMES = ["s3://", "gs://", "az://"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from runai_model_streamer import list_safetensors as runai_list_safetensors
|
from runai_model_streamer import list_safetensors as runai_list_safetensors
|
||||||
|
|||||||
@@ -23,8 +23,12 @@ def is_gcs(model_or_path: str) -> bool:
|
|||||||
return model_or_path.lower().startswith("gs://")
|
return model_or_path.lower().startswith("gs://")
|
||||||
|
|
||||||
|
|
||||||
|
def is_azure(model_or_path: str) -> bool:
|
||||||
|
return model_or_path.lower().startswith("az://")
|
||||||
|
|
||||||
|
|
||||||
def is_cloud_storage(model_or_path: str) -> bool:
|
def is_cloud_storage(model_or_path: str) -> bool:
|
||||||
return is_s3(model_or_path) or is_gcs(model_or_path)
|
return is_s3(model_or_path) or is_gcs(model_or_path) or is_azure(model_or_path)
|
||||||
|
|
||||||
|
|
||||||
def without_trust_remote_code(kwargs: dict[str, Any]) -> dict[str, Any]:
|
def without_trust_remote_code(kwargs: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
|||||||
Reference in New Issue
Block a user