[BUGFIX]fix CUDA OOM ERROR : invalid argument at cumem_allocator.cpp:119 (#35594)
Signed-off-by: xjx <493337577@qq.com>
This commit is contained in:
@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
|
|||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
int flag = 0;
|
int flag = 0;
|
||||||
CUDA_CHECK(cuDeviceGetAttribute(
|
CUresult rdma_result = cuDeviceGetAttribute(
|
||||||
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
|
||||||
device));
|
device);
|
||||||
if (flag) { // support GPUDirect RDMA if possible
|
if (rdma_result == CUDA_SUCCESS &&
|
||||||
|
flag) { // support GPUDirect RDMA if possible
|
||||||
prop.allocFlags.gpuDirectRDMACapable = 1;
|
prop.allocFlags.gpuDirectRDMACapable = 1;
|
||||||
}
|
}
|
||||||
int fab_flag = 0;
|
int fab_flag = 0;
|
||||||
CUDA_CHECK(cuDeviceGetAttribute(
|
CUresult fab_result = cuDeviceGetAttribute(
|
||||||
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
|
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
|
||||||
if (fab_flag) { // support fabric handle if possible
|
if (fab_result == CUDA_SUCCESS &&
|
||||||
|
fab_flag) { // support fabric handle if possible
|
||||||
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user