Refactor system architecture (#109)

This commit is contained in:
Woosuk Kwon
2023-05-20 13:06:59 -07:00
committed by GitHub
parent 7297fa6f7c
commit c3442c1f6f
24 changed files with 1017 additions and 1034 deletions

View File

@@ -1,12 +1,10 @@
from cacheflow.model_executor.input_metadata import InputMetadata
from cacheflow.model_executor.model_loader import get_model
from cacheflow.model_executor.utils import (set_random_seed,
get_cache_block_size)
from cacheflow.model_executor.utils import set_random_seed
# Names exported as this module's public API; each one is imported above.
__all__ = [
"InputMetadata",
"get_cache_block_size",
"get_model",
"set_random_seed",
]

View File

@@ -10,9 +10,9 @@ from cacheflow import cache_ops
from cacheflow import pos_encoding_ops
from cacheflow.model_executor.input_metadata import InputMetadata
# Attention head sizes this module lists as supported.
# NOTE(review): where and how this list is enforced is not visible in this
# excerpt — confirm at the use site.
_SUPPORTED_HEAD_SIZES = [32, 64, 80, 96, 128, 160, 192, 256]
class GPTCacheFlowAttention(nn.Module):
"""GPT-style multi-head attention.

View File

@@ -1,16 +1,13 @@
"""Utilities for selecting and loading models."""
from typing import Optional
import torch
import torch.nn as nn
from transformers import AutoConfig, PretrainedConfig
from transformers import PretrainedConfig
from cacheflow.config import ModelConfig
from cacheflow.model_executor.models import (
GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
from cacheflow.model_executor.utils import get_torch_dtype
from cacheflow.model_executor.weight_utils import initialize_dummy_weights
# TODO(woosuk): Lazy-load the model classes.
_MODEL_REGISTRY = {
"GPT2LMHeadModel": GPT2LMHeadModel,
@@ -19,6 +16,7 @@ _MODEL_REGISTRY = {
"OPTForCausalLM": OPTForCausalLM,
}
def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
architectures = getattr(config, "architectures", [])
for arch in architectures:
@@ -30,51 +28,22 @@ def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
)
def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
    """Choose the torch dtype to use for a model.

    Args:
        config: Model config; its optional ``torch_dtype`` attribute is read.
        dtype: ``"default"`` to derive the dtype from ``config``, otherwise a
            dtype name that is resolved through ``get_torch_dtype``.

    Returns:
        The selected ``torch.dtype``.

    Raises:
        ValueError: If an explicitly requested dtype differs from the config
            dtype and the config dtype is not float32.
    """
    # NOTE: passing torch.float32 as the getattr default would not be enough,
    # because the attribute can exist and still be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32
    is_fp32_model = config_dtype == torch.float32

    if dtype == "default":
        # Following the common practice, float32 models are run in float16;
        # any other config dtype is used unchanged.
        return torch.float16 if is_fp32_model else config_dtype

    torch_dtype = get_torch_dtype(dtype)
    if not is_fp32_model and torch_dtype != config_dtype:
        # TODO(woosuk): Allow using float16 for bfloat16 models and
        # vice versa. Print a warning message and continue.
        raise ValueError(
            f"Cannot use {torch_dtype} for {config_dtype} model.")
    return torch_dtype
# NOTE(review): two variants of get_model are interleaved in this span — one
# taking (model_name, dtype, cache_dir, use_dummy_weights, use_np_cache) and
# returning (model.eval(), torch_dtype), and one taking a single ModelConfig
# and returning model.eval(). Presumably these are the before/after sides of
# a diff whose +/- markers were lost; confirm against the repository before
# treating either one as the current definition.
# Both variants resolve the model class with _get_model_architecture, set the
# torch default dtype, instantiate the class, and then either move the model
# to CUDA and call initialize_dummy_weights, or call load_weights and move
# the model to CUDA afterwards.
def get_model(
model_name: str,
dtype: str,
cache_dir: Optional[str],
use_dummy_weights: bool,
use_np_cache: bool,
) -> nn.Module:
config = AutoConfig.from_pretrained(model_name)
torch_dtype = _get_dtype(config, dtype)
torch.set_default_dtype(torch_dtype)
model_class = _get_model_architecture(config)
def get_model(model_config: ModelConfig) -> nn.Module:
model_class = _get_model_architecture(model_config.hf_config)
torch.set_default_dtype(model_config.dtype)
# Create a model instance.
# The weights will be initialized as empty tensors.
model = model_class(config)
if use_dummy_weights:
model = model_class(model_config.hf_config)
if model_config.use_dummy_weights:
model = model.cuda()
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
initialize_dummy_weights(model)
else:
# Load the weights from the cached or downloaded files.
model.load_weights(model_name, cache_dir, use_np_cache)
model.load_weights(
model_config.model, model_config.download_dir,
model_config.use_np_weights)
model = model.cuda()
return model.eval(), torch_dtype
return model.eval()

View File

@@ -1,6 +1,5 @@
"""Utils for model executor."""
import random
from typing import Union
import numpy as np
import torch
@@ -9,28 +8,6 @@ from cacheflow.model_executor.parallel_utils.parallel_state import model_paralle
from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed
# Accepted dtype names mapped to torch dtypes. get_torch_dtype() lower-cases
# a name before looking it up here. "half" and "float" are the torch aliases
# of "float16" and "float32".
_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.half,
"float": torch.float,
"float16": torch.float16,
"float32": torch.float32,
"bfloat16": torch.bfloat16,
}
def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
    """Normalize a dtype given either as a name or as a ``torch.dtype``.

    A string is lower-cased and looked up in ``_STR_DTYPE_TO_TORCH_DTYPE``
    (an unknown name raises ``KeyError``). Any non-string value is returned
    unchanged.
    """
    if not isinstance(dtype, str):
        return dtype
    return _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
    """Return the size in bytes of a single element of ``dtype``.

    ``dtype`` may be a ``torch.dtype`` or a dtype name; it is resolved with
    ``get_torch_dtype`` first.
    """
    # An empty tensor of the resolved dtype is enough to query the
    # per-element size.
    probe = torch.tensor([], dtype=get_torch_dtype(dtype))
    return probe.element_size()
def set_random_seed(seed: int) -> None:
random.seed(seed)
np.random.seed(seed)
@@ -40,15 +17,3 @@ def set_random_seed(seed: int) -> None:
if model_parallel_is_initialized():
model_parallel_cuda_manual_seed(seed)
def get_cache_block_size(block_size: int,
                         num_heads: int,
                         head_size: int,
                         num_layers: int,
                         dtype: str) -> int:
    """Return the size in bytes of one key/value cache block over all layers.

    A block holds ``block_size * num_heads * head_size`` elements for the
    keys and the same number for the values, in each of ``num_layers``
    layers. The element count is multiplied by the per-element size that
    ``get_dtype_size`` reports for ``dtype``.
    """
    # Keys and values take the same number of elements, hence the factor 2.
    elements_per_layer = 2 * (block_size * num_heads * head_size)
    num_elements = num_layers * elements_per_layer
    return get_dtype_size(dtype) * num_elements