[Bugfix][sleepmode][fp8 kv cache]: Fix FP8 KV cache + sleep(level=2) gibberish output (#28783)
Signed-off-by: vensen <vensenmu@gmail.com> Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
This commit is contained in:
@@ -11,7 +11,7 @@ from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test, requires_fp8
|
||||
|
||||
|
||||
@create_new_process_for_each_test("fork" if not current_platform.is_rocm() else "spawn")
|
||||
@@ -243,3 +243,34 @@ def test_deep_sleep_async():
|
||||
assert output.outputs[0].text == output2.outputs[0].text
|
||||
|
||||
asyncio.run(test())
|
||||
|
||||
|
||||
@requires_fp8
|
||||
def test_deep_sleep_fp8_kvcache():
|
||||
GiB_bytes = 1 << 30
|
||||
model = "Qwen/Qwen2-0.5B"
|
||||
used_bytes_baseline = current_platform.get_current_memory_usage()
|
||||
|
||||
llm = LLM(model, enable_sleep_mode=True, kv_cache_dtype="fp8")
|
||||
prompt = "How are you?"
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
output = llm.generate(prompt, sampling_params)
|
||||
|
||||
# Put the engine to deep sleep
|
||||
llm.sleep(level=2)
|
||||
|
||||
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
|
||||
assert used_bytes < 3 * GiB_bytes
|
||||
|
||||
llm.wake_up(tags=["weights"])
|
||||
llm.collective_rpc("reload_weights")
|
||||
|
||||
used_bytes = current_platform.get_current_memory_usage() - used_bytes_baseline
|
||||
assert used_bytes < 4 * GiB_bytes
|
||||
|
||||
# now allocate kv cache and cuda graph memory
|
||||
llm.wake_up(tags=["kv_cache"])
|
||||
output2 = llm.generate(prompt, sampling_params)
|
||||
|
||||
# cmp output
|
||||
assert output[0].outputs[0].text == output2[0].outputs[0].text
|
||||
|
||||
Reference in New Issue
Block a user