[BugFix][MRV2] Fix cuda event reuse race (#39115)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -17,7 +17,6 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
||||
num_sampled_tokens: torch.Tensor,
|
||||
main_stream: torch.cuda.Stream,
|
||||
copy_stream: torch.cuda.Stream,
|
||||
- copy_event: torch.cuda.Event,
|
||||
):
|
||||
# NOTE(woosuk): We must retain references to the GPU tensors,
|
||||
# as the copy operations are performed on a different CUDA stream than
|
||||
@@ -25,7 +24,7 @@ class AsyncOutput(AsyncModelRunnerOutput):
|
||||
self.model_runner_output = model_runner_output
|
||||
self.sampler_output = sampler_output
|
||||
self.num_sampled_tokens = num_sampled_tokens
|
||||
- self.copy_event = copy_event
|
||||
+ self.copy_event = torch.cuda.Event()
|
||||
|
||||
with stream(copy_stream, main_stream):
|
||||
copy_stream.wait_stream(main_stream)
|
||||
@@ -78,12 +77,11 @@ class AsyncPoolingOutput(AsyncModelRunnerOutput):
|
||||
is_valid: torch.Tensor | None,
|
||||
main_stream: torch.cuda.Stream,
|
||||
copy_stream: torch.cuda.Stream,
|
||||
- copy_event: torch.cuda.Event,
|
||||
):
|
||||
self.model_runner_output = model_runner_output
|
||||
self.pooler_output = pooler_output
|
||||
self.is_valid = is_valid
|
||||
- self.copy_event = copy_event
|
||||
+ self.copy_event = torch.cuda.Event()
|
||||
|
||||
with stream(copy_stream, main_stream):
|
||||
copy_stream.wait_stream(main_stream)
|
||||
|
||||
@@ -130,7 +130,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
self.use_async_scheduling = self.scheduler_config.async_scheduling
|
||||
self.output_copy_stream = torch.cuda.Stream(self.device)
|
||||
- self.output_copy_event = torch.cuda.Event()
|
||||
|
||||
# Pipeline parallelism.
|
||||
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
|
||||
@@ -1180,7 +1179,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
num_sampled_tokens=num_sampled,
|
||||
main_stream=self.main_stream,
|
||||
copy_stream=self.output_copy_stream,
|
||||
- copy_event=self.output_copy_event,
|
||||
)
|
||||
|
||||
mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None
|
||||
@@ -1270,7 +1268,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
is_valid=is_valid,
|
||||
main_stream=self.main_stream,
|
||||
copy_stream=self.output_copy_stream,
|
||||
- copy_event=self.output_copy_event,
|
||||
)
|
||||
|
||||
self.postprocess_pool(input_batch)
|
||||
|
||||
Reference in New Issue
Block a user