Refactor system architecture (#109)

2023-05-20 13:06:59 -07:00
parent 7297fa6f7c
commit c3442c1f6f
24 changed files with 1017 additions and 1034 deletions
--- a/cacheflow/model_executor/__init__.py
+++ b/cacheflow/model_executor/__init__.py
@@ -1,12 +1,10 @@
 from cacheflow.model_executor.input_metadata import InputMetadata
 from cacheflow.model_executor.model_loader import get_model
-from cacheflow.model_executor.utils import (set_random_seed,
-                                            get_cache_block_size)
+from cacheflow.model_executor.utils import set_random_seed


 __all__ = [
    "InputMetadata",
-    "get_cache_block_size",
    "get_model",
    "set_random_seed",
 ]
--- a/cacheflow/model_executor/layers/attention.py
+++ b/cacheflow/model_executor/layers/attention.py
@@ -10,9 +10,9 @@ from cacheflow import cache_ops
 from cacheflow import pos_encoding_ops
 from cacheflow.model_executor.input_metadata import InputMetadata

-
 _SUPPORTED_HEAD_SIZES = [32, 64, 80, 96, 128, 160, 192, 256]

+
 class GPTCacheFlowAttention(nn.Module):
    """GPT-style multi-head attention.

--- a/cacheflow/model_executor/model_loader.py
+++ b/cacheflow/model_executor/model_loader.py
@@ -1,16 +1,13 @@
 """Utilities for selecting and loading models."""
-from typing import Optional
-
 import torch
 import torch.nn as nn
-from transformers import AutoConfig, PretrainedConfig
+from transformers import PretrainedConfig

+from cacheflow.config import ModelConfig
 from cacheflow.model_executor.models import (
    GPT2LMHeadModel, GPTNeoXForCausalLM, LlamaForCausalLM, OPTForCausalLM)
-from cacheflow.model_executor.utils import get_torch_dtype
 from cacheflow.model_executor.weight_utils import initialize_dummy_weights

-
 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
    "GPT2LMHeadModel": GPT2LMHeadModel,
@@ -19,6 +16,7 @@ _MODEL_REGISTRY = {
    "OPTForCausalLM": OPTForCausalLM,
 }

+
 def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
    architectures = getattr(config, "architectures", [])
    for arch in architectures:
@@ -30,51 +28,22 @@ def _get_model_architecture(config: PretrainedConfig) -> nn.Module:
    )


-def _get_dtype(config: PretrainedConfig, dtype: str) -> torch.dtype:
-    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
-    # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
-    if config_dtype is None:
-        config_dtype = torch.float32
-    if dtype == "default":
-        if config_dtype == torch.float32:
-            # Following the common practice, we use float16 for float32 models.
-            torch_dtype = torch.float16
-        else:
-            torch_dtype = config_dtype
-    else:
-        torch_dtype = get_torch_dtype(dtype)
-        if torch_dtype != config_dtype and config_dtype != torch.float32:
-            # TODO(woosuk): Allow using float16 for bfloat16 models and
-            # vice versa. Print a warning message and continue.
-            raise ValueError(
-                f"Cannot use {torch_dtype} for {config_dtype} model.")
-    return torch_dtype
-
-
-def get_model(
-    model_name: str,
-    dtype: str,
-    cache_dir: Optional[str],
-    use_dummy_weights: bool,
-    use_np_cache: bool,
-) -> nn.Module:
-    config = AutoConfig.from_pretrained(model_name)
-    torch_dtype = _get_dtype(config, dtype)
-    torch.set_default_dtype(torch_dtype)
-    model_class = _get_model_architecture(config)
+def get_model(model_config: ModelConfig) -> nn.Module:
+    model_class = _get_model_architecture(model_config.hf_config)
+    torch.set_default_dtype(model_config.dtype)

    # Create a model instance.
    # The weights will be initialized as empty tensors.
-    model = model_class(config)
-    if use_dummy_weights:
+    model = model_class(model_config.hf_config)
+    if model_config.use_dummy_weights:
        model = model.cuda()
        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
        initialize_dummy_weights(model)
    else:
        # Load the weights from the cached or downloaded files.
-        model.load_weights(model_name, cache_dir, use_np_cache)
+        model.load_weights(
+            model_config.model, model_config.download_dir,
+            model_config.use_np_weights)
        model = model.cuda()
-    return model.eval(), torch_dtype
-
+    return model.eval()
--- a/cacheflow/model_executor/utils.py
+++ b/cacheflow/model_executor/utils.py
@@ -1,6 +1,5 @@
 """Utils for model executor."""
 import random
-from typing import Union

 import numpy as np
 import torch
@@ -9,28 +8,6 @@ from cacheflow.model_executor.parallel_utils.parallel_state import model_paralle
 from cacheflow.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed


-_STR_DTYPE_TO_TORCH_DTYPE = {
-    "half": torch.half,
-    "float": torch.float,
-    "float16": torch.float16,
-    "float32": torch.float32,
-    "bfloat16": torch.bfloat16,
-}
-
-
-def get_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
-    if isinstance(dtype, str):
-        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype.lower()]
-    else:
-        torch_dtype = dtype
-    return torch_dtype
-
-
-def get_dtype_size(dtype: Union[torch.dtype, str]) -> int:
-    torch_dtype = get_torch_dtype(dtype)
-    return torch.tensor([], dtype=torch_dtype).element_size()
-
-
 def set_random_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
@@ -40,15 +17,3 @@ def set_random_seed(seed: int) -> None:

    if model_parallel_is_initialized():
        model_parallel_cuda_manual_seed(seed)
-
-
-def get_cache_block_size(block_size: int,
-                         num_heads: int,
-                         head_size: int,
-                         num_layers: int,
-                         dtype: str) -> int:
-    key_cache_block = block_size * num_heads * head_size
-    value_cache_block = key_cache_block
-    total = num_layers * (key_cache_block + value_cache_block)
-    dtype_size = get_dtype_size(dtype)
-    return dtype_size * total