[BugFix] Fix memory spike in workspace allocation (#30744)

commit 00a8d7628c (parent 4de08ad698)
Author: Lucas Wilkinson
Date: 2025-12-16 09:46:22 -05:00 (committed by GitHub)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

2 changed files with 13 additions and 3 deletions

@@ -1223,6 +1223,8 @@ steps:
# FIXIT: find out which code initializes CUDA before running the test;
# until that is fixed, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# A lot of these tests are on the edge of OOMing
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
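The CI step above can be sketched as a plain shell session (a minimal illustration, not the actual CI runner): both environment variables must be exported before the Python process starts, since the CUDA caching allocator reads `PYTORCH_CUDA_ALLOC_CONF` at initialization and the multiprocessing start method must be chosen before workers are spawned.

```shell
# Sketch of the CI step: export env vars first, then run the test.
# spawn avoids inheriting an already-initialized CUDA context in workers.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# expandable_segments lets the allocator grow segments instead of
# reserving fixed-size blocks, reducing fragmentation-driven OOMs.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
echo "$PYTORCH_CUDA_ALLOC_CONF"
pytest -v -s -x lora/test_chatglm3_tp.py
```

Note that `expandable_segments:True` trades a small allocation-time cost for the ability to reuse freed virtual address space, which is why it helps tests that sit right at the edge of GPU memory capacity.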