Bleeding edge: vLLM main branch, flashinfer v0.6.7, Gitea fork source

custom weights tweaks
2026-04-28 10:17:50 +00:00 · 2026-04-28 08:59:53 +00:00
2 changed files with 23 additions and 18 deletions
--- a/vllm/Dockerfile
+++ b/vllm/Dockerfile
@@ -102,7 +102,7 @@ ARG FLASHINFER_ENABLE_AOT=1
 #   - v0.6.7 works with vLLM v0.18.2rc0 (Build #43)
 #   - v0.6.6 works with vLLM v0.19.0 (for Gemma 4 support)
 # ARG FLASHINFER_REF=v0.6.7  # For vLLM v0.18.2rc0
-ARG FLASHINFER_REF=v0.6.6
+ARG FLASHINFER_REF=v0.6.7
 ARG FLASHINFER_BUILD_SUFFIX=cu132
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
@@ -151,10 +151,10 @@ RUN apt-get update && apt-get install -y build-essential cmake gcc && \
 # ==============================================================================
 FROM build-base AS build-vllm
 # vLLM version to build
-ARG VLLM_REF=v0.19.0
+ARG VLLM_REF=main
 # Install ccache for faster compilation
 RUN apt-get update && apt-get install -y ccache
-RUN git clone https://github.com/vllm-project/vllm.git
+RUN git clone https://sweetapi.com/biondizzle/vllm.git
 RUN cd vllm && \
    git checkout ${VLLM_REF} && \
    echo "\n\n========================================" && \
--- a/vllm/vllm_shim_module.py
+++ b/vllm/vllm_shim_module.py
@@ -72,7 +72,7 @@ def download_file(url: str, dest: str):
    for attempt in range(1, MAX_DOWNLOAD_RETRIES + 1):
        try:
            log(f"Downloading {url} -> {dest} (attempt {attempt}/{MAX_DOWNLOAD_RETRIES})")
-            urllib.request.urlretrieve(url, dest, reporthook=_download_progress)
+            urllib.request.urlretrieve(url, dest)
            log(f"Download complete: {dest}")
            return
        except Exception as e:
@@ -88,19 +88,6 @@ def download_file(url: str, dest: str):
                raise
 def _download_progress(block_num, block_size, total_size):
    """Write a one-line progress update each time a new 10% step is reached.

    The signature is the one ``urllib.request.urlretrieve`` expects for its
    ``reporthook`` argument.

    Args:
        block_num: Number of blocks transferred so far (0 on the first call).
        block_size: Size of one block in bytes.
        total_size: Total size of the download in bytes; values <= 0 mean
            the size is unknown and nothing is reported.

    The update is written to ``sys.stdout`` prefixed with a carriage return
    and without a trailing newline. Each 10% step is reported at most once
    per transfer, and the byte counter never exceeds ``total_size``.
    """
    if block_num == 0:
        # First callback of a transfer: forget the step reported by any
        # earlier download (or earlier retry of this one).
        _download_progress._last_step = 0
    if total_size <= 0:
        # Size unknown, so a percentage cannot be computed.
        return
    # The last block usually overshoots the real size; clamp so neither the
    # percentage nor the MB counter can run past the total.
    downloaded = min(block_num * block_size, total_size)
    pct = downloaded * 100 // total_size
    # Round down to the 10% step this percentage belongs to.
    step = pct - pct % 10
    if step <= getattr(_download_progress, "_last_step", 0):
        # Still inside a step that was already reported (or below 10%).
        return
    _download_progress._last_step = step
    mb_down = downloaded / (1024 * 1024)
    mb_total = total_size / (1024 * 1024)
    sys.stdout.write(f"\r  {pct}% ({mb_down:.0f}/{mb_total:.0f} MB)")
    sys.stdout.flush()
 def extract_archive(archive_path: str, dest_dir: str, archive_type: str):
    """Extract archive to dest_dir based on archive_type."""
    log(f"Extracting {archive_path} ({archive_type}) -> {dest_dir}")
@@ -264,12 +251,30 @@ def strip_shim_from_pythonpath():
        log(f"Stripped {SHIM_DIR} from PYTHONPATH (was: {pp!r}, now: {new_pp!r})")
 def invoked_module_path() -> str:
    """
    Return the dotted module path of this file inside the shadow package.

    Under `python -m vllm.entrypoints.openai.api_server` the value of
    __name__ is "__main__", which cannot be used to re-invoke the module.
    The path is therefore rebuilt from where this file sits relative to the
    shim root (SHIM_DIR, e.g. /opt/vllm-shim).
    """
    py_suffix = ".py"
    # e.g. /opt/vllm-shim/vllm/entrypoints/openai/api_server.py
    #   -> vllm/entrypoints/openai/api_server.py
    shim_relative = os.path.relpath(os.path.abspath(__file__), SHIM_DIR)
    # Drop the trailing .py when present
    if shim_relative.endswith(py_suffix):
        shim_relative = shim_relative[: -len(py_suffix)]
    # Path separators become the dots of the module path
    return ".".join(shim_relative.split(os.sep))
 def main():
    args = sys.argv[1:]
    # Determine which vllm module was actually invoked so we exec the real one
    # (could be vllm.entrypoints.cli.main, vllm.entrypoints.openai.api_server, etc.)
-    invoked_module = __name__  # e.g. "vllm.entrypoints.cli.main" or "vllm.entrypoints.openai.api_server"
+    invoked_module = invoked_module_path()
    log("=" * 50)
    log("vLLM Custom Weights Shim")
    log(f"  Invoked as: python -m {invoked_module} {' '.join(args)}")
Author	SHA1	Message	Date
biondizzle	0698298d13	Bleeding edge: vLLM main branch, flashinfer v0.6.7, Gitea fork source	2026-04-28 10:17:50 +00:00
biondizzle	6e03b5d357	custom weights tweaks	2026-04-28 08:59:53 +00:00