[Fix] Load kv-cache dtype from hf_quant_config.json automatically (fix for reverted PR) (#30785)
Signed-off-by: <> Co-authored-by: root <root@gpu-937.slurm-workers-slurm.slurm.svc.cluster.local>
This commit is contained in:
@@ -93,6 +93,7 @@ from vllm.transformers_utils.utils import is_cloud_storage
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.network_utils import get_ip
|
||||
from vllm.utils.torch_utils import resolve_kv_cache_dtype_string
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -106,6 +107,7 @@ else:
|
||||
LoadFormats = Any
|
||||
UsageContext = Any
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# object is used to allow for special typing forms
|
||||
@@ -1361,12 +1363,17 @@ class EngineArgs:
|
||||
f"dcp_size={self.decode_context_parallel_size}."
|
||||
)
|
||||
|
||||
# Resolve "auto" kv_cache_dtype to actual value from model config
|
||||
resolved_cache_dtype = resolve_kv_cache_dtype_string(
|
||||
self.kv_cache_dtype, model_config
|
||||
)
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size,
|
||||
gpu_memory_utilization=self.gpu_memory_utilization,
|
||||
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
|
||||
swap_space=self.swap_space,
|
||||
cache_dtype=self.kv_cache_dtype,
|
||||
cache_dtype=resolved_cache_dtype,
|
||||
is_attention_free=model_config.is_attention_free,
|
||||
num_gpu_blocks_override=self.num_gpu_blocks_override,
|
||||
sliding_window=sliding_window,
|
||||
|
||||
Reference in New Issue
Block a user