Refactor sliding window configuration to Transformers best practice (#21927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -39,6 +39,7 @@ from vllm.plugins import load_general_plugins
|
||||
from vllm.ray.lazy_utils import is_ray_initialized
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
|
||||
+from vllm.transformers_utils.config import is_interleaved
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
|
||||
GiB_bytes, get_ip, is_in_ray_actor)
|
||||
@@ -1081,6 +1082,13 @@ class EngineArgs:
|
||||
"DualChunkFlashAttention is not supported on V1 engine. "
|
||||
"To run the model in V0 engine, try set 'VLLM_USE_V1=0'")
|
||||
|
||||
+sliding_window: Optional[int] = None
|
||||
+if not is_interleaved(model_config.hf_text_config):
|
||||
+# Only set CacheConfig.sliding_window if the model is all sliding
|
||||
+# window. Otherwise CacheConfig.sliding_window will override the
|
||||
+# global layers in interleaved sliding window models.
|
||||
+sliding_window = model_config.get_sliding_window()
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size,
|
||||
gpu_memory_utilization=self.gpu_memory_utilization,
|
||||
@@ -1088,7 +1096,7 @@ class EngineArgs:
|
||||
cache_dtype=self.kv_cache_dtype,
|
||||
is_attention_free=model_config.is_attention_free,
|
||||
num_gpu_blocks_override=self.num_gpu_blocks_override,
|
||||
-sliding_window=model_config.get_sliding_window(),
|
||||
+sliding_window=sliding_window,
|
||||
enable_prefix_caching=self.enable_prefix_caching,
|
||||
prefix_caching_hash_algo=self.prefix_caching_hash_algo,
|
||||
cpu_offload_gb=self.cpu_offload_gb,
|
||||
|
||||
Reference in New Issue
Block a user