[Misc]: Implement CPU/GPU swapping in BlockManagerV2 (#3834)

This commit is contained in:
Kaiyang Chen
2024-06-04 04:37:11 +08:00
committed by GitHub
parent cafb8e06c5
commit 10c38e3e46
16 changed files with 529 additions and 49 deletions

View File

@@ -75,6 +75,7 @@ class EngineArgs:
num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0
model_loader_extra_config: Optional[dict] = None
preemption_mode: Optional[str] = None
# Related to Vision-language models such as llava
image_input_type: Optional[str] = None
@@ -564,6 +565,13 @@ class EngineArgs:
'corresponding to the chosen load_format. '
'This should be a JSON string that will be '
'parsed into a dictionary.')
# Register the preemption-strategy flag. It is a free-form string that
# stays None when the flag is omitted. The help text must describe the
# two modes differently: 'recompute' preempts by recomputing, 'swap'
# preempts by block swapping.
parser.add_argument(
    '--preemption_mode',
    type=str,
    default=None,
    help='If \'recompute\', the engine performs preemption by '
    'recomputing; If \'swap\', the engine performs preemption by block '
    'swapping.')
parser.add_argument(
"--served-model-name",
@@ -667,6 +675,7 @@ class EngineArgs:
delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill,
embedding_mode=model_config.embedding_mode,
preemption_mode=self.preemption_mode,
)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,