diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 57e54b66a..50c116f85 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -443,9 +443,10 @@ class EngineCore:
         deferred_scheduler_output = None
         if self.scheduler.has_requests():
             scheduler_output = self.scheduler.schedule()
-            exec_future = self.model_executor.execute_model(
-                scheduler_output, non_block=True
-            )
+            with self.log_error_detail(scheduler_output):
+                exec_future = self.model_executor.execute_model(
+                    scheduler_output, non_block=True
+                )
             if self.is_ec_consumer:
                 model_executed = scheduler_output.total_num_scheduled_tokens > 0
 
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index a110596b7..2ae982119 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -100,12 +100,17 @@ class UniProcExecutor(Executor):
     def execute_model(  # type: ignore[override]
         self, scheduler_output: SchedulerOutput, non_block: bool = False
     ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
-        return self.collective_rpc(
+        output = self.collective_rpc(
             "execute_model",
             args=(scheduler_output,),
             non_block=non_block,
             single_value=True,
         )
+        # In non-blocking mode, surface an already-finished failure without waiting.
+        if non_block and output.done():
+            # Already done, so result() cannot block; it re-raises any task exception.
+            output.result()
+        return output
 
     def sample_tokens(  # type: ignore[override]
         self, grammar_output: GrammarOutput | None, non_block: bool = False