Support FP8-E5M2 KV Cache (#2279)

Co-authored-by: zhaoyang <zhao.yang16@zte.com.cn>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Author: zhaoyang-star
Date: 2024-01-29 08:43:54 +08:00
Committed by: GitHub
Parent: 7d648418b8
Commit: 9090bf02e7
26 changed files with 912 additions and 196 deletions

@@ -85,6 +85,7 @@ class LLMEngine:
             f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
             f"quantization={model_config.quantization}, "
             f"enforce_eager={model_config.enforce_eager}, "
+            f"kv_cache_dtype={cache_config.cache_dtype}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.
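With this change the engine's startup banner reports the KV cache dtype alongside the other configs. As a hedged usage sketch (not part of this diff), the new option can be exercised through the engine arguments; the "fp8_e5m2" value is what this PR introduces, "auto" keeps the model's own dtype, and the model name below is only an example.

from vllm import LLM, SamplingParams

# Usage sketch: request the FP8-E5M2 KV cache instead of the default.
# The startup log should then show kv_cache_dtype=fp8_e5m2 among the
# engine configs printed above.
llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2")
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)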
@@ -144,6 +145,7 @@ class LLMEngine:
             rank=0,
             distributed_init_method=distributed_init_method,
             lora_config=self.lora_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
             is_driver_worker=True,
         )
         self._run_workers("init_model")
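The driver worker now receives the cache dtype as a plain string at construction time. Below is a minimal sketch of how a worker might resolve that string into a concrete torch dtype, assuming fp8_e5m2 values are held in a one-byte container (torch.uint8) on builds without a native float8 type; the mapping and function name are illustrative, not the exact vLLM code.

import torch

# Illustrative mapping: "auto" defers to the model dtype, while
# "fp8_e5m2" is stored as raw bytes (one byte per element).
_STR_TO_TORCH_DTYPE = {
    "fp8_e5m2": torch.uint8,
}

def resolve_kv_cache_dtype(cache_dtype: str,
                           model_dtype: torch.dtype) -> torch.dtype:
    if cache_dtype == "auto":
        return model_dtype
    return _STR_TO_TORCH_DTYPE[cache_dtype]

print(resolve_kv_cache_dtype("fp8_e5m2", torch.float16))  # torch.uint8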
@@ -234,6 +236,7 @@ class LLMEngine:
         model_config = copy.deepcopy(self.model_config)
         parallel_config = copy.deepcopy(self.parallel_config)
         scheduler_config = copy.deepcopy(self.scheduler_config)
+        cache_config = copy.deepcopy(self.cache_config)
         for rank, (worker, (node_id,
                             _)) in enumerate(zip(self.workers,
@@ -249,6 +252,7 @@ class LLMEngine:
                     rank,
                     distributed_init_method,
                     lora_config=self.lora_config,
+                    cache_config=cache_config,
                 ))
         driver_rank = 0
@@ -261,6 +265,7 @@ class LLMEngine:
             driver_rank,
             distributed_init_method,
             lora_config=self.lora_config,
+            cache_config=cache_config,
             is_driver_worker=True,
         )
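The three hunks above plumb a deep-copied cache_config into every Ray worker, mirroring how the other configs are shipped. Plausibly the deepcopy keeps the per-worker closure from capturing the engine itself, so only small standalone config objects cross the serialization boundary; the sketch below illustrates that pattern with stand-in classes, not the actual vLLM types.

import copy

class Worker:  # stand-in for vllm.worker.worker.Worker
    def __init__(self, model_config, rank, cache_config=None):
        self.model_config = model_config
        self.rank = rank
        self.cache_config = cache_config

def make_worker_factory(engine):
    # Bind deep copies to locals so the closure captures only these
    # objects, never `engine` itself (which would drag the whole engine
    # through the serializer when the factory is shipped to an actor).
    model_config = copy.deepcopy(engine.model_config)
    cache_config = copy.deepcopy(engine.cache_config)  # the copy this PR adds
    return lambda rank: Worker(model_config, rank, cache_config=cache_config)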
@@ -306,6 +311,7 @@ class LLMEngine:
             block_size=self.cache_config.block_size,
             gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
             cpu_swap_space=self.cache_config.swap_space_bytes,
+            cache_dtype=self.cache_config.cache_dtype,
         )
         # Since we use a shared centralized controller, we take the minimum
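Profiling the number of available blocks now needs the cache dtype because the per-block footprint depends on element size: FP8-E5M2 stores one byte per value versus two for FP16, so the same memory budget holds roughly twice as many KV blocks. A back-of-the-envelope sketch, using the usual paged-KV layout and made-up model shapes (the function name and shapes are illustrative):

# Bytes for one KV cache block: key block + value block across all
# layers, with block_size tokens per block.
def cache_block_bytes(block_size: int,
                      num_layers: int,
                      num_kv_heads: int,
                      head_size: int,
                      cache_dtype: str) -> int:
    elem_size = 1 if cache_dtype == "fp8_e5m2" else 2  # fp16/bf16 default
    per_token = num_kv_heads * head_size * elem_size
    return 2 * num_layers * block_size * per_token

fp16_block = cache_block_bytes(16, 32, 32, 128, "auto")
fp8_block = cache_block_bytes(16, 32, 32, 128, "fp8_e5m2")
assert fp8_block * 2 == fp16_block  # half the bytes, twice the blocks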