[Hardware][Neuron] Refactor neuron support (#3471)

This commit is contained in:
Zhuohan Li
2024-03-21 18:22:17 -07:00
committed by GitHub
parent ea5f14e6ff
commit e90fc21f2e
33 changed files with 615 additions and 549 deletions

View File

@@ -2,7 +2,7 @@ import torch
from dataclasses import dataclass
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from typing import Optional
-from vllm.utils import in_wsl
+from vllm.utils import is_pin_memory_available
import time
from typing import Callable
@@ -63,7 +63,7 @@ class AsyncMetricsCollector:
self._in_flight_copy: Optional[torch.cuda.Event] = None
-pin_memory = not in_wsl()
+pin_memory = is_pin_memory_available()
self._aggregate_num_accepted_tokens = torch.tensor(
0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
self._aggregate_num_emitted_tokens = torch.tensor(

View File

@@ -27,8 +27,8 @@ class MultiStepWorker(Worker):
self._proposer: Optional[DraftModelTop1Proposer] = None
-def init_model(self):
-super().init_model()
+def init_device(self):
+super().init_device()
self._proposer = DraftModelTop1Proposer(
self,

View File

@@ -79,13 +79,13 @@ class SpecDecodeWorker:
self.scorer: SpeculativeScorer = None
-def init_model(self) -> None:
+def init_device(self) -> None:
"""Initialize both scorer and proposer models.
"""
# The scorer worker model is initialized first in case the proposer
# model has a smaller TP degree than the target worker.
-self.scorer_worker.init_model()
-self.proposer_worker.init_model()
+self.scorer_worker.init_device()
+self.proposer_worker.init_device()
self._metrics.init_gpu_tensors(self.rank)
self.rejection_sampler.init_gpu_tensors(self.rank)