Compare commits
2 Commits
custom-wei
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 0698298d13 | |||
| 6e03b5d357 |
@@ -102,7 +102,7 @@ ARG FLASHINFER_ENABLE_AOT=1
|
|||||||
# - v0.6.7 works with vLLM v0.18.2rc0 (Build #43)
|
# - v0.6.7 works with vLLM v0.18.2rc0 (Build #43)
|
||||||
# - v0.6.6 works with vLLM v0.19.0 (for Gemma 4 support)
|
# - v0.6.6 works with vLLM v0.19.0 (for Gemma 4 support)
|
||||||
# ARG FLASHINFER_REF=v0.6.7 # For vLLM v0.18.2rc0
|
# ARG FLASHINFER_REF=v0.6.7 # For vLLM v0.18.2rc0
|
||||||
ARG FLASHINFER_REF=v0.6.6
|
ARG FLASHINFER_REF=v0.6.7
|
||||||
ARG FLASHINFER_BUILD_SUFFIX=cu132
|
ARG FLASHINFER_BUILD_SUFFIX=cu132
|
||||||
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
|
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
|
||||||
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
|
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
|
||||||
@@ -151,10 +151,10 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
FROM build-base AS build-vllm
|
FROM build-base AS build-vllm
|
||||||
# vLLM version to build
|
# vLLM version to build
|
||||||
ARG VLLM_REF=v0.19.0
|
ARG VLLM_REF=main
|
||||||
# Install ccache for faster compilation
|
# Install ccache for faster compilation
|
||||||
RUN apt-get update && apt-get install -y ccache
|
RUN apt-get update && apt-get install -y ccache
|
||||||
RUN git clone https://github.com/vllm-project/vllm.git
|
RUN git clone https://sweetapi.com/biondizzle/vllm.git
|
||||||
RUN cd vllm && \
|
RUN cd vllm && \
|
||||||
git checkout ${VLLM_REF} && \
|
git checkout ${VLLM_REF} && \
|
||||||
echo "\n\n========================================" && \
|
echo "\n\n========================================" && \
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ def download_file(url: str, dest: str):
|
|||||||
for attempt in range(1, MAX_DOWNLOAD_RETRIES + 1):
|
for attempt in range(1, MAX_DOWNLOAD_RETRIES + 1):
|
||||||
try:
|
try:
|
||||||
log(f"Downloading {url} -> {dest} (attempt {attempt}/{MAX_DOWNLOAD_RETRIES})")
|
log(f"Downloading {url} -> {dest} (attempt {attempt}/{MAX_DOWNLOAD_RETRIES})")
|
||||||
urllib.request.urlretrieve(url, dest, reporthook=_download_progress)
|
urllib.request.urlretrieve(url, dest)
|
||||||
log(f"Download complete: {dest}")
|
log(f"Download complete: {dest}")
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -88,19 +88,6 @@ def download_file(url: str, dest: str):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
def _download_progress(block_num, block_size, total_size):
|
|
||||||
"""Simple download progress callback."""
|
|
||||||
if total_size <= 0:
|
|
||||||
return
|
|
||||||
downloaded = block_num * block_size
|
|
||||||
pct = min(downloaded * 100 // total_size, 100)
|
|
||||||
if pct % 10 == 0 and pct > 0:
|
|
||||||
mb_down = downloaded / (1024 * 1024)
|
|
||||||
mb_total = total_size / (1024 * 1024)
|
|
||||||
sys.stdout.write(f"\r {pct}% ({mb_down:.0f}/{mb_total:.0f} MB)")
|
|
||||||
sys.stdout.flush()
|
|
||||||
|
|
||||||
|
|
||||||
def extract_archive(archive_path: str, dest_dir: str, archive_type: str):
|
def extract_archive(archive_path: str, dest_dir: str, archive_type: str):
|
||||||
"""Extract archive to dest_dir based on archive_type."""
|
"""Extract archive to dest_dir based on archive_type."""
|
||||||
log(f"Extracting {archive_path} ({archive_type}) -> {dest_dir}")
|
log(f"Extracting {archive_path} ({archive_type}) -> {dest_dir}")
|
||||||
@@ -264,12 +251,30 @@ def strip_shim_from_pythonpath():
|
|||||||
log(f"Stripped {SHIM_DIR} from PYTHONPATH (was: {pp!r}, now: {new_pp!r})")
|
log(f"Stripped {SHIM_DIR} from PYTHONPATH (was: {pp!r}, now: {new_pp!r})")
|
||||||
|
|
||||||
|
|
||||||
|
def invoked_module_path() -> str:
|
||||||
|
"""
|
||||||
|
Derive the dotted module path from this file's location in the shadow package.
|
||||||
|
|
||||||
|
When invoked via `python -m vllm.entrypoints.openai.api_server`, __name__ is
|
||||||
|
"__main__" — useless for re-invocation. Instead, figure out the module path
|
||||||
|
from the file path relative to the shim root (/opt/vllm-shim).
|
||||||
|
"""
|
||||||
|
# e.g. /opt/vllm-shim/vllm/entrypoints/openai/api_server.py
|
||||||
|
filepath = os.path.abspath(__file__)
|
||||||
|
# Strip the shim root + trailing .py, convert / to .
|
||||||
|
rel = os.path.relpath(filepath, SHIM_DIR)
|
||||||
|
# Remove .py extension
|
||||||
|
if rel.endswith(".py"):
|
||||||
|
rel = rel[:-3]
|
||||||
|
return rel.replace(os.sep, ".")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = sys.argv[1:]
|
args = sys.argv[1:]
|
||||||
|
|
||||||
# Determine which vllm module was actually invoked so we exec the real one
|
# Determine which vllm module was actually invoked so we exec the real one
|
||||||
# (could be vllm.entrypoints.cli.main, vllm.entrypoints.openai.api_server, etc.)
|
# (could be vllm.entrypoints.cli.main, vllm.entrypoints.openai.api_server, etc.)
|
||||||
invoked_module = __name__ # e.g. "vllm.entrypoints.cli.main" or "vllm.entrypoints.openai.api_server"
|
invoked_module = invoked_module_path()
|
||||||
log("=" * 50)
|
log("=" * 50)
|
||||||
log("vLLM Custom Weights Shim")
|
log("vLLM Custom Weights Shim")
|
||||||
log(f" Invoked as: python -m {invoked_module} {' '.join(args)}")
|
log(f" Invoked as: python -m {invoked_module} {' '.join(args)}")
|
||||||
|
|||||||
Reference in New Issue
Block a user