Allocate more shared memory to attention kernel (#1154)

This commit is contained in:
Antoni Baum
2023-09-26 22:27:13 -07:00
committed by GitHub
parent 03ffd0a022
commit cf5cb1e33e
7 changed files with 87 additions and 3 deletions

View File

@@ -195,6 +195,17 @@ quantization_extension = CUDAExtension(
)
ext_modules.append(quantization_extension)
# Misc. CUDA utils.
# Declares the "vllm.cuda_utils" extension module, built from one .cpp source
# and one .cu source. CXX_FLAGS and NVCC_FLAGS are passed through under the
# "cxx" and "nvcc" keys of extra_compile_args.
cuda_utils_extension = CUDAExtension(
    name="vllm.cuda_utils",
    sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
# Register the extension in ext_modules, next to the ones appended before it.
ext_modules.append(cuda_utils_extension)
def get_path(*filepath) -> str:
    """Return the path formed by joining ``filepath`` onto ``ROOT_DIR``.

    Args:
        *filepath: Path components, passed to ``os.path.join`` after
            ``ROOT_DIR`` in the order given.

    Returns:
        The result of ``os.path.join(ROOT_DIR, *filepath)``.
    """
    joined = os.path.join(ROOT_DIR, *filepath)
    return joined