add an env var for path to pre-downloaded flashinfer cubin files (#22675)
This commit is contained in:
@@ -158,6 +158,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
|
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
|
||||||
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
|
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
|
||||||
VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
|
VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
|
||||||
|
VLLM_HAS_FLASHINFER_CUBIN: bool = False
|
||||||
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
|
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
|
||||||
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
|
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
|
||||||
VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
|
VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
|
||||||
@@ -1105,6 +1106,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_USE_TRTLLM_ATTENTION":
|
"VLLM_USE_TRTLLM_ATTENTION":
|
||||||
lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
|
lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
|
||||||
|
|
||||||
|
# If set to any non-empty value (even "0"), it means we pre-downloaded
|
||||||
|
# cubin files and flashinfer will read the cubin files directly.
|
||||||
|
"VLLM_HAS_FLASHINFER_CUBIN":
|
||||||
|
lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
|
||||||
|
|
||||||
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
|
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
|
||||||
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
|
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
|
||||||
# vllm cutlass GEMM, marlin GEMM.
|
# vllm cutlass GEMM, marlin GEMM.
|
||||||
|
|||||||
@@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool:
|
|||||||
This checks connectivity to the kernel inference library artifactory
|
This checks connectivity to the kernel inference library artifactory
|
||||||
which is required for downloading certain cubin kernels like TRTLLM FHMA.
|
which is required for downloading certain cubin kernels like TRTLLM FHMA.
|
||||||
"""
|
"""
|
||||||
|
# Since FLASHINFER_CUBIN_DIR defines the pre-downloaded cubins path, when
|
||||||
|
# VLLM_HAS_FLASHINFER_CUBIN is set, we can assume the cubins are available.
|
||||||
|
if envs.VLLM_HAS_FLASHINFER_CUBIN:
|
||||||
|
return True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use a short timeout to avoid blocking for too long
|
# Use a short timeout to avoid blocking for too long
|
||||||
response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)
|
response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)
|
||||||
|
|||||||
Reference in New Issue
Block a user