[ROCm] Apply FP8 weights padding to values not divisible by 512 bytes on ROCm (#13231)
This commit is contained in:
committed by
GitHub
parent
558db8083c
commit
c904fdddf6
@@ -74,6 +74,7 @@ if TYPE_CHECKING:
|
||||
VLLM_SKIP_P2P_CHECK: bool = False
|
||||
VLLM_DISABLED_KERNELS: List[str] = []
|
||||
VLLM_USE_V1: bool = False
|
||||
VLLM_ROCM_FP8_PADDING: bool = True
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
|
||||
VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
|
||||
VLLM_DISABLE_COMPILE_CACHE: bool = False
|
||||
@@ -507,6 +508,9 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_V1":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))),
|
||||
|
||||
# Pad the FP8 weights to 256 bytes for ROCm (enabled by default; set to 0 to disable)
|
||||
"VLLM_ROCM_FP8_PADDING":
|
||||
lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))),
|
||||
# Divisor for dynamic key scale factor calculation for FP8 KV Cache
|
||||
"K_SCALE_CONSTANT":
|
||||
lambda: int(os.getenv("K_SCALE_CONSTANT", "200")),
|
||||
|
||||
Reference in New Issue
Block a user