[misc][cuda] use NVML to avoid accidental CUDA initialization (#6007)

youkaichao authored 2024-06-30 20:07:34 -07:00, committed by GitHub
parent af9ad46fca
commit 614aa51203
13 changed files with 86 additions and 68 deletions
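For context: on the CUDA backend, torch.cuda.get_device_capability() lazily initializes the CUDA context as a side effect, so merely importing a module that calls it at top level can leave the process holding a context (which breaks, e.g., later fork-based multiprocessing). A minimal demonstration of that side effect, assuming a CUDA-capable machine; this snippet is illustrative and not code from this commit:

    # Illustration only (not part of this commit): querying device
    # capability through torch.cuda initializes the CUDA context.
    import torch

    print(torch.cuda.is_initialized())   # False: CUDA untouched so far
    torch.cuda.get_device_capability(0)  # triggers torch.cuda's lazy init
    print(torch.cuda.is_initialized())   # True: a context now exists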


@@ -2,13 +2,13 @@ import math
 import torch
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import get_device_capability_stateless, is_cpu, is_hip
 from .utils import (dense_to_crow_col, get_head_sliding_step,
                     get_sparse_attn_mask)
 IS_COMPUTE_8_OR_ABOVE = (torch.cuda.is_available()
-                         and torch.cuda.get_device_capability()[0] >= 8)
+                         and get_device_capability_stateless()[0] >= 8)
 if IS_COMPUTE_8_OR_ABOVE:
     from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd
@@ -235,4 +235,4 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
             v,
             cu_seqlens_k,
             cu_seqlens_q=cu_seqlens_q,
-            sm_scale=sm_scale)
+            sm_scale=sm_scale)
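
NVML queries go through the NVIDIA driver directly and never create a CUDA context, which is what makes the stateless helper safe to call at import time. Below is a minimal sketch of what such a helper could look like using pynvml; the function name mirrors the one in the diff above, but the body is an assumption for illustration, not vLLM's actual vllm.utils implementation:

    # Hedged sketch: an NVML-backed capability query. The name matches
    # the helper in the diff, but this body is an assumption, not the
    # real vllm.utils code.
    import pynvml  # provided by the nvidia-ml-py / pynvml package

    def get_device_capability_stateless(device_id: int = 0) -> tuple:
        """Return (major, minor) compute capability via NVML, which
        talks to the driver directly and creates no CUDA context."""
        pynvml.nvmlInit()
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            return major, minor
        finally:
            pynvml.nvmlShutdown()

With a helper along these lines, the module-level IS_COMPUTE_8_OR_ABOVE check in the first hunk can run at import time without leaving a CUDA context behind in the parent process.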