diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2c43fea..5ebea1c42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -667,7 +667,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") endif() diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 04b64a35d..1001af05f 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -254,7 +254,7 @@ void cutlass_moe_mm( bool per_act_token, bool per_out_ch) { int32_t version_num = get_sm_version_num(); #if defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100 - if (version_num >= 100) { + if (version_num >= 100 && version_num < 110) { cutlass_moe_mm_sm100(out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); @@ -262,7 +262,7 @@ void cutlass_moe_mm( } #endif #if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90 - if (version_num >= 90) { + if (version_num >= 90 && version_num < 100) { cutlass_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets, problem_sizes, a_strides, b_strides, c_strides, per_act_token, per_out_ch); diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 8c69870b2..6b208bca6 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2747,6 +2747,8 @@ class MemorySnapshot: self.measure() def measure(self): + from vllm.platforms import current_platform + # we measure the torch peak memory usage via allocated_bytes, # rather than `torch.cuda.memory_reserved()` . 
# After `torch.cuda.reset_peak_memory_stats()`, @@ -2756,6 +2758,24 @@ class MemorySnapshot: "allocated_bytes.all.peak", 0) self.free_memory, self.total_memory = torch.cuda.mem_get_info() + shared_sysmem_device_mem_sms = ( + (8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark + if current_platform.is_cuda() and \ + current_platform.get_device_capability() in \ + shared_sysmem_device_mem_sms: + # On UMA (Orin, Thor and Spark) platforms, + # where both CPU and GPU rely on system memory, + # the cudaMemGetInfo function shows the amount of free system memory + # rather than what's actually available. + # In that case, + # torch.cuda.mem_get_info() only reports "free" memory, + # which can be lower than what is actually + # available due to not including cache memory. + # There's also a comprehensive reference page + # that explains how you can compute the proper value yourself. + # https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/#estimating-total-allocatable-device-memory-on-an-integrated-gpu-device + self.free_memory = psutil.virtual_memory().available + self.cuda_memory = self.total_memory - self.free_memory # torch.cuda.memory_reserved() is how many bytes