[FP8][Kernel] Dynamic kv cache scaling factors computation (#11906)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com> Co-authored-by: Micah Williamson <micah.williamson@amd.com>
2025-01-23 13:04:03 -05:00
parent 6e650f56a1
commit e97f802b2d
60 changed files with 276 additions and 1365 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -120,11 +120,6 @@ class ModelConfig:
            decoding draft models.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
-        quantization_param_path: Path to JSON file containing scaling factors.
-            Used to load KV cache scaling factors into the model when KV cache
-            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
-            be used to load activation and weight scaling factors when the
-            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
@@ -187,7 +182,6 @@ class ModelConfig:
        factors.append(self.model)
        factors.append(self.dtype)
        factors.append(self.quantization)
-        factors.append(self.quantization_param_path)
        factors.append(self.revision)
        factors.append(self.code_revision)
        factors.append(self.trust_remote_code)
@@ -213,7 +207,6 @@ class ModelConfig:
        max_model_len: Optional[int] = None,
        spec_target_max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
-        quantization_param_path: Optional[str] = None,
        enforce_eager: Optional[bool] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 20,
@@ -274,7 +267,6 @@ class ModelConfig:
        else:
            self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
-        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_seq_len_to_capture = max_seq_len_to_capture
        self.max_logprobs = max_logprobs
@@ -1002,6 +994,7 @@ class CacheConfig:
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
        cpu_offload_gb: float = 0,
+        calculate_kv_scales: Optional[bool] = None,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
@@ -1012,7 +1005,7 @@ class CacheConfig:
        self.sliding_window = sliding_window
        self.enable_prefix_caching = enable_prefix_caching
        self.cpu_offload_gb = cpu_offload_gb
-
+        self.calculate_kv_scales = calculate_kv_scales
        self._verify_args()
        self._verify_cache_dtype()
        self._verify_prefix_caching()
@@ -1021,6 +1014,10 @@ class CacheConfig:
        self.num_gpu_blocks: Optional[int] = None
        self.num_cpu_blocks: Optional[int] = None

+        # Set calculate_kv_scales to False if the value is unset.
+        if self.calculate_kv_scales is None:
+            self.calculate_kv_scales = False
+
    def metrics_info(self):
        # convert cache_config to dict(key: str, value: str) for prometheus
        # metrics info
@@ -3297,7 +3294,6 @@ class VllmConfig:
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
-            f"quantization_param_path={self.model_config.quantization_param_path},"
            f" device_config={self.device_config.device}, "
            f"decoding_config={self.decoding_config!r}, "
            f"observability_config={self.observability_config!r}, "