[Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)
Signed-off-by: pengdrumli <pengdrumli@tencent.com>
tests/test_sharded_state_loader.py
@@ -57,10 +57,19 @@ def llama_3p2_1b_files():
 def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     llm_sharded_writer = LLM(model=input_dir, **kwargs)
+    # Check which engine version is being used
+    is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
     # Dump worker states to output directory
-    llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
-        path=output_dir)
+    if is_v1_engine:
+        # For V1 engine, we need to use engine_core.save_sharded_state
+        print("Using V1 engine save path")
+        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
+            path=output_dir)
+    else:
+        # For V0 engine
+        print("Using V0 engine save path")
+        model_executor = llm_sharded_writer.llm_engine.model_executor
+        model_executor.save_sharded_state(path=output_dir)
+
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
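For context, the save path this helper exercises can be driven standalone; a minimal sketch, assuming a vLLM install where `LLM` exposes `llm_engine` and, on V1, an `engine_core` attribute as in the diff above (the model path and function name here are illustrative):

# Minimal sketch of dumping sharded state on either engine version.
# Attribute names mirror the diff above; the model path and the
# save_sharded() helper name are placeholders.
from vllm import LLM

def save_sharded(model_dir: str, out_dir: str) -> None:
    llm = LLM(model=model_dir, enforce_eager=True)
    engine = llm.llm_engine
    if hasattr(engine, "engine_core"):
        # V1 engine: the save call lives on engine_core
        engine.engine_core.save_sharded_state(path=out_dir)
    else:
        # V0 engine: go through the model executor
        engine.model_executor.save_sharded_state(path=out_dir)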
@@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
     gpu_memory_utilization = 0.8
     input_dir = llama_3p2_1b_files
     ctx = mp.get_context("spawn")
-    # The interface in v1 engine has changed, run in v1 engine will hang.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
 
     # Run in separate processes for memory & CUDA isolation
     with TemporaryDirectory() as output_dir:
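The `VLLM_USE_V1=0` pin and its comment are gone because the V1 hang no longer applies; the spawn-context isolation stays. A minimal sketch of that pattern, with `_work` as a hypothetical stand-in for `_run_writer`/`_run_generate`:

# Each phase runs in a fresh spawned process so GPU memory and CUDA
# state are fully released between the write and the read.
import multiprocessing as mp

def _work(path: str) -> None:
    print(f"running against {path}")  # real helpers build an LLM here

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # fresh interpreter, no inherited state
    p = ctx.Process(target=_work, args=("/tmp/model",))
    p.start()
    p.join()
    assert p.exitcode == 0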
@@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
                         args=(input_dir, output_dir, weights_patterns),
                         kwargs=dict(
                             tensor_parallel_size=tp_size,
-                            distributed_executor_backend="mp",
                             gpu_memory_utilization=gpu_memory_utilization,
                             enforce_eager=True,
                         ))
@@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
         p = ctx.Process(target=_run_generate,
                         args=(input_dir, queue),
                         kwargs=dict(
-                            distributed_executor_backend="mp",
                             enable_lora=enable_lora,
                             gpu_memory_utilization=gpu_memory_utilization,
                             tensor_parallel_size=tp_size,
@@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
         p = ctx.Process(target=_run_generate,
                         args=(output_dir, queue),
                         kwargs=dict(
-                            distributed_executor_backend="mp",
                             enable_lora=enable_lora,
                             gpu_memory_utilization=gpu_memory_utilization,
                             tensor_parallel_size=tp_size,
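All three helper invocations drop the explicit `distributed_executor_backend="mp"`, leaving the executor choice to vLLM's defaults. Roughly, the helpers now construct the engine like this (sketch; the model path and TP size are placeholders):

# Sketch of the kwargs the helpers now pass; vLLM picks the distributed
# executor backend itself.
from vllm import LLM

llm = LLM(
    model="/path/to/llama-3.2-1b",  # placeholder path
    tensor_parallel_size=2,         # placeholder; the test uses tp_size
    gpu_memory_utilization=0.8,
    enforce_eager=True,
)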
vllm/engine/arg_utils.py
@@ -1486,12 +1486,6 @@ class EngineArgs:
         #############################################################
         # Unsupported Feature Flags on V1.
 
-        if self.load_format == "sharded_state":
-            _raise_or_fallback(
-                feature_name=f"--load_format {self.load_format}",
-                recommend_to_remove=False)
-            return False
-
         if (self.logits_processor_pattern
                 != EngineArgs.logits_processor_pattern):
             _raise_or_fallback(feature_name="--logits-processor-pattern",
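With that guard removed, `--load_format sharded_state` no longer forces a fallback off the V1 engine, so shards written by `_run_writer` can be loaded back directly. A minimal sketch, assuming the placeholder path holds a previously dumped sharded state:

# Sketch of reloading sharded state under V1; the path is a placeholder
# for the directory the writer produced.
from vllm import LLM

llm = LLM(
    model="/tmp/sharded-output",  # placeholder: dir with dumped shards
    load_format="sharded_state",
    tensor_parallel_size=2,       # must match the TP size used to save
    enforce_eager=True,
)
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)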