[FP8][Kernel] Dynamic kv cache scaling factors computation (#11906)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Micah Williamson <micah.williamson@amd.com>
This commit is contained in:
Gregory Shtrasberg
2025-01-23 13:04:03 -05:00
committed by GitHub
parent 6e650f56a1
commit e97f802b2d
60 changed files with 276 additions and 1365 deletions

View File

@@ -120,11 +120,6 @@ class ModelConfig:
decoding draft models.
quantization: Quantization method that was used to quantize the model
weights. If None, we assume the model weights are not quantized.
quantization_param_path: Path to JSON file containing scaling factors.
Used to load KV cache scaling factors into the model when KV cache
type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
be used to load activation and weight scaling factors when the
model dtype is FP8_E4M3 on ROCm.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
@@ -187,7 +182,6 @@ class ModelConfig:
factors.append(self.model)
factors.append(self.dtype)
factors.append(self.quantization)
factors.append(self.quantization_param_path)
factors.append(self.revision)
factors.append(self.code_revision)
factors.append(self.trust_remote_code)
@@ -213,7 +207,6 @@ class ModelConfig:
max_model_len: Optional[int] = None,
spec_target_max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
quantization_param_path: Optional[str] = None,
enforce_eager: Optional[bool] = None,
max_seq_len_to_capture: Optional[int] = None,
max_logprobs: int = 20,
@@ -274,7 +267,6 @@ class ModelConfig:
else:
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization
self.quantization_param_path = quantization_param_path
self.enforce_eager = enforce_eager
self.max_seq_len_to_capture = max_seq_len_to_capture
self.max_logprobs = max_logprobs
@@ -1002,6 +994,7 @@ class CacheConfig:
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0,
calculate_kv_scales: Optional[bool] = None,
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
@@ -1012,7 +1005,7 @@ class CacheConfig:
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self.calculate_kv_scales = calculate_kv_scales
self._verify_args()
self._verify_cache_dtype()
self._verify_prefix_caching()
@@ -1021,6 +1014,10 @@ class CacheConfig:
self.num_gpu_blocks: Optional[int] = None
self.num_cpu_blocks: Optional[int] = None
# Set calculate_kv_scales to False if the value is unset.
if self.calculate_kv_scales is None:
self.calculate_kv_scales = False
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
@@ -3297,7 +3294,6 @@ class VllmConfig:
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f"quantization_param_path={self.model_config.quantization_param_path},"
f" device_config={self.device_config.device}, "
f"decoding_config={self.decoding_config!r}, "
f"observability_config={self.observability_config!r}, "