[CI][amd] Revert NIXL connector change to avoid crash (#32570)
Signed-off-by: Qiang Li <qiang.li2@amd.com>
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
This commit is contained in:
@@ -1451,7 +1451,7 @@ steps:
|
|||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
|
||||||
|
|
||||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@@ -1465,7 +1465,7 @@ steps:
|
|||||||
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
- VLLM_ATTENTION_BACKEND=ROCM_ATTN bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental, amdproduction]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
|
|||||||
@@ -385,5 +385,5 @@ RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
|
|||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|
||||||
#Set entrypoint for vllm-openai official images
|
#Set entrypoint for vllm-openai official images
|
||||||
FROM final As vllm-openai
|
FROM final AS vllm-openai
|
||||||
ENTRYPOINT ["vllm", "serve"]
|
ENTRYPOINT ["vllm", "serve"]
|
||||||
|
|||||||
@@ -89,20 +89,20 @@ logger = init_logger(__name__)
|
|||||||
|
|
||||||
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
|
# Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
|
||||||
try:
|
try:
|
||||||
if "UCX_MEM_MMAP_HOOK_MODE" not in os.environ:
|
if "UCX_RCACHE_MAX_UNRELEASED" not in os.environ:
|
||||||
# avoid a memory leak in UCX when using NIXL on some models
|
# avoid a memory leak in UCX when using NIXL on some models
|
||||||
# see: https://github.com/vllm-project/vllm/issues/24264
|
# see: https://github.com/vllm-project/vllm/issues/24264
|
||||||
if "nixl" in sys.modules or "rixl" in sys.modules:
|
if "nixl" in sys.modules or "rixl" in sys.modules:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"NIXL was already imported, we can't disable UCX mmap hooks. "
|
"NIXL was already imported, we can't reset UCX_RCACHE_MAX_UNRELEASED. "
|
||||||
"Please set UCX_MEM_MMAP_HOOK_MODE to 'none' manually."
|
"Please set it to '1024' manually."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Setting UCX_MEM_MMAP_HOOK_MODE to 'none' to avoid a rare "
|
"Setting UCX_RCACHE_MAX_UNRELEASED to '1024' to avoid a rare "
|
||||||
"memory leak in UCX when using NIXL."
|
"memory leak in UCX when using NIXL."
|
||||||
)
|
)
|
||||||
os.environ["UCX_MEM_MMAP_HOOK_MODE"] = "none"
|
os.environ["UCX_RCACHE_MAX_UNRELEASED"] = "1024"
|
||||||
|
|
||||||
if not current_platform.is_rocm():
|
if not current_platform.is_rocm():
|
||||||
from nixl._api import nixl_agent as NixlWrapper
|
from nixl._api import nixl_agent as NixlWrapper
|
||||||
|
|||||||
Reference in New Issue
Block a user