[Platform] Move async output check to platform (#10768)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2024-12-10 01:24:46 +08:00
committed by GitHub
parent e691b26f6f
commit aea2fc38c3
10 changed files with 66 additions and 22 deletions

View File

@@ -513,11 +513,10 @@ class ModelConfig:
# Reminder: Please update docs/source/usage/compatibility_matrix.rst
# If the feature combo become valid
- if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
+ if not current_platform.is_async_output_supported(self.enforce_eager):
logger.warning(
- "Async output processing is only supported for CUDA, TPU, XPU "
- "and HPU."
- "Disabling it for other platforms.")
+ "Async output processing is not supported on the "
+ "current platform type %s.", current_platform.device_type)
self.use_async_output_proc = False
return
@@ -527,16 +526,6 @@ class ModelConfig:
self.use_async_output_proc = False
return
# Reminder: Please update docs/source/usage/compatibility_matrix.rst
# If the feature combo become valid
if device_config.device_type == "cuda" and self.enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
self.use_async_output_proc = not self.enforce_eager
return
# Async postprocessor is not necessary with embedding mode
# since there is no token generation
if self.task == "embedding":

View File

@@ -1,4 +1,4 @@
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional
import psutil
import torch
@@ -37,6 +37,10 @@ class CpuPlatform(Platform):
def get_device_total_memory(cls, device_id: int = 0) -> int:
return psutil.virtual_memory().total
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False
@classmethod
def inference_mode(cls):
return torch.no_grad()

View File

@@ -4,7 +4,7 @@ pynvml. However, it should not initialize cuda context.
import os
from functools import lru_cache, wraps
- from typing import TYPE_CHECKING, Callable, List, TypeVar
+ from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar
import pynvml
import torch
@@ -88,6 +88,16 @@ class CudaPlatformBase(Platform):
def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
if enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
return False
return True
@classmethod
def is_full_nvlink(cls, device_ids: List[int]) -> bool:
raise NotImplementedError

View File

@@ -1,4 +1,4 @@
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional
import torch
@@ -20,6 +20,10 @@ class HpuPlatform(Platform):
def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
return _Backend.HPU_ATTN
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True
@staticmethod
def inference_mode():
return torch.no_grad()

View File

@@ -6,11 +6,15 @@ from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
import numpy as np
import torch
from vllm.logger import init_logger
if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
logger = init_logger(__name__)
class _Backend(enum.Enum):
FLASH_ATTN = enum.auto()
@@ -147,6 +151,13 @@ class Platform:
"""Get the total memory of a device in bytes."""
raise NotImplementedError
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
"""
Check if the current platform supports async output.
"""
raise NotImplementedError
@classmethod
def inference_mode(cls):
"""A device-specific wrapper of `torch.inference_mode`.

View File

@@ -1,4 +1,4 @@
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional
from .interface import Platform, PlatformEnum
@@ -18,6 +18,10 @@ class NeuronPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str:
return "neuron"
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config = vllm_config.parallel_config

View File

@@ -1,4 +1,4 @@
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional
import torch
@@ -37,6 +37,10 @@ class OpenVinoPlatform(Platform):
def get_device_name(self, device_id: int = 0) -> str:
return "openvino"
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return False
@classmethod
def inference_mode(self):
return torch.inference_mode(mode=True)

View File

@@ -1,6 +1,6 @@
import os
from functools import lru_cache
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional
import torch
@@ -72,6 +72,16 @@ class RocmPlatform(Platform):
device_props = torch.cuda.get_device_properties(device_id)
return device_props.total_memory
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
if enforce_eager:
logger.warning(
"To see benefits of async output processing, enable CUDA "
"graph. Since, enforce-eager is enabled, async output "
"processor cannot be used")
return False
return True
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
parallel_config = vllm_config.parallel_config

View File

@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING from typing import TYPE_CHECKING, Optional
import torch import torch
@@ -35,6 +35,10 @@ class TpuPlatform(Platform):
def get_device_total_memory(cls, device_id: int = 0) -> int:
raise NotImplementedError
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True
@classmethod
def inference_mode(cls):
return torch.no_grad()

View File

@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING from typing import TYPE_CHECKING, Optional
import torch import torch
@@ -41,6 +41,10 @@ class XPUPlatform(Platform):
device_props = torch.xpu.get_device_properties(device_id)
return device_props.total_memory
@classmethod
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
return True
@staticmethod
def inference_mode():
return torch.no_grad()