[ROCm] fix sleep mode not releasing GPU memory problem on ROCm (#37533)

Signed-off-by: bingzhaodong <aaab8b@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
This commit is contained in:
Zhaodong Bing
2026-03-23 21:07:19 +08:00
committed by GitHub
parent aec2dc6c0d
commit 10a1018c12

View File

@@ -232,6 +232,28 @@ void unmap_and_release(unsigned long long device, ssize_t size,
}
}
// ROCm workaround: hipMemRelease does not return physical VRAM to the
// free pool while the virtual-address reservation is still held.
// Cycling cuMemAddressFree → cuMemAddressReserve (at the same address)
// forces the driver to actually release the physical pages while keeping
// the same VA available for a later create_and_map.
if (first_error == no_error) {
first_error = cuMemAddressFree(d_mem, size);
if (first_error == no_error) {
CUdeviceptr d_mem_new = 0;
first_error = cuMemAddressReserve(&d_mem_new, size, 0, d_mem, 0);
if (first_error == no_error && d_mem_new != d_mem) {
cuMemAddressFree(d_mem_new, size);
snprintf(error_msg, sizeof(error_msg),
"ROCm: VA re-reserve got %p instead of %p", (void*)d_mem_new,
(void*)d_mem);
error_code = CUresult(1);
std::cerr << error_msg << std::endl;
return;
}
}
}
if (first_error != no_error) {
CUDA_CHECK(first_error);
}