[Hardware][Neuron] Refactor neuron support (#3471)
This commit is contained in:
@@ -2,7 +2,7 @@ import torch
|
||||
from dataclasses import dataclass
|
||||
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
|
||||
from typing import Optional
|
||||
from vllm.utils import in_wsl
|
||||
from vllm.utils import is_pin_memory_available
|
||||
import time
|
||||
from typing import Callable
|
||||
|
||||
@@ -63,7 +63,7 @@ class AsyncMetricsCollector:
|
||||
|
||||
self._in_flight_copy: Optional[torch.cuda.Event] = None
|
||||
|
||||
pin_memory = not in_wsl()
|
||||
pin_memory = is_pin_memory_available()
|
||||
self._aggregate_num_accepted_tokens = torch.tensor(
|
||||
0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
|
||||
self._aggregate_num_emitted_tokens = torch.tensor(
|
||||
|
||||
@@ -27,8 +27,8 @@ class MultiStepWorker(Worker):
|
||||
|
||||
self._proposer: Optional[DraftModelTop1Proposer] = None
|
||||
|
||||
def init_model(self):
|
||||
super().init_model()
|
||||
def init_device(self):
|
||||
super().init_device()
|
||||
|
||||
self._proposer = DraftModelTop1Proposer(
|
||||
self,
|
||||
|
||||
@@ -79,13 +79,13 @@ class SpecDecodeWorker:
|
||||
|
||||
self.scorer: SpeculativeScorer = None
|
||||
|
||||
def init_model(self) -> None:
|
||||
def init_device(self) -> None:
|
||||
"""Initialize both scorer and proposer models.
|
||||
"""
|
||||
# The scorer worker model is initialized first in case the proposer
|
||||
# model has a smaller TP degree than the target worker.
|
||||
self.scorer_worker.init_model()
|
||||
self.proposer_worker.init_model()
|
||||
self.scorer_worker.init_device()
|
||||
self.proposer_worker.init_device()
|
||||
|
||||
self._metrics.init_gpu_tensors(self.rank)
|
||||
self.rejection_sampler.init_gpu_tensors(self.rank)
|
||||
|
||||
Reference in New Issue
Block a user