[V1] Move OOM check into sampler run (#14728)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
This commit is contained in:
@@ -1288,9 +1288,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
allowed_token_ids_mask=None,
|
allowed_token_ids_mask=None,
|
||||||
bad_words_token_ids={},
|
bad_words_token_ids={},
|
||||||
)
|
)
|
||||||
sampler_output = self.model.sample(logits=logits,
|
try:
|
||||||
sampling_metadata=dummy_metadata)
|
sampler_output = self.model.sample(
|
||||||
|
logits=logits, sampling_metadata=dummy_metadata)
|
||||||
|
except RuntimeError as e:
|
||||||
|
if 'out of memory' in str(e):
|
||||||
|
raise RuntimeError(
|
||||||
|
"CUDA out of memory occurred when warming up sampler with "
|
||||||
|
f"{num_reqs} dummy requests. Please try lowering "
|
||||||
|
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||||
|
"initializing the engine.") from e
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
return sampler_output
|
return sampler_output
|
||||||
|
|
||||||
def profile_run(self) -> None:
|
def profile_run(self) -> None:
|
||||||
|
|||||||
@@ -221,21 +221,11 @@ class Worker(WorkerBase):
|
|||||||
# NOTE: This is called after `capture_model` on purpose to prevent
|
# NOTE: This is called after `capture_model` on purpose to prevent
|
||||||
# memory buffers from being cleared by `torch.cuda.empty_cache`.
|
# memory buffers from being cleared by `torch.cuda.empty_cache`.
|
||||||
if get_pp_group().is_last_rank:
|
if get_pp_group().is_last_rank:
|
||||||
try:
|
max_num_reqs = min(self.scheduler_config.max_num_seqs,
|
||||||
max_num_reqs = min(
|
self.scheduler_config.max_num_batched_tokens)
|
||||||
self.scheduler_config.max_num_seqs,
|
self.model_runner._dummy_sampler_run(
|
||||||
self.scheduler_config.max_num_batched_tokens)
|
hidden_states=self.model_runner._dummy_run(
|
||||||
self.model_runner._dummy_sampler_run(
|
num_tokens=max_num_reqs))
|
||||||
hidden_states=self.model_runner._dummy_run(
|
|
||||||
num_tokens=max_num_reqs))
|
|
||||||
except RuntimeError as e:
|
|
||||||
if 'out of memory' in str(e):
|
|
||||||
raise RuntimeError(
|
|
||||||
"CUDA out of memory occurred when warming up sampler. "
|
|
||||||
"Please try lowering `gpu_memory_utilization` when "
|
|
||||||
"initializing the engine.") from None
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
|
|
||||||
# Reset the seed to ensure that the random state is not affected by
|
# Reset the seed to ensure that the random state is not affected by
|
||||||
# the model initialization and profiling.
|
# the model initialization and profiling.
|
||||||
|
|||||||
Reference in New Issue
Block a user