Implement AWQ quantization support for LLaMA (#1032)

Co-authored-by: Robert Irvine <robert@seamlessml.com>
Co-authored-by: root <rirv938@gmail.com>
Co-authored-by: Casper <casperbh.96@gmail.com>
Co-authored-by: julian-q <julianhquevedo@gmail.com>
Author: Woosuk Kwon
Date:   2023-09-16 00:03:37 -07:00
Commit: e3e79e9e8a (parent: b9fe4616f9)

19 changed files with 1178 additions and 208 deletions
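
Note: a minimal usage sketch of what this commit enables, via the Python API. The checkpoint name below is a hypothetical AWQ-quantized LLaMA model, not one referenced by this commit; the quantization keyword mirrors the new --quantization CLI flag added in the diff below.

from vllm import LLM, SamplingParams

# Hypothetical AWQ-quantized checkpoint; substitute any AWQ LLaMA weights.
llm = LLM(model="TheBloke/Llama-2-7B-AWQ", quantization="awq")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)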


@@ -29,6 +29,7 @@ class EngineArgs:
     max_num_seqs: int = 256
     disable_log_stats: bool = False
     revision: Optional[str] = None
+    quantization: Optional[str] = None
 
     def __post_init__(self):
         if self.tokenizer is None:
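
The new quantization field is forwarded to ModelConfig (see the create_engine_configs hunk below), which is the natural place to validate it. A minimal sketch of such validation, assuming a helper named _verify_quantization; the helper name, supported-method list, and error message are assumptions, not taken from this diff.

class ModelConfig:
    ...

    def _verify_quantization(self) -> None:
        # Hypothetical validation: normalize the method name and
        # reject anything outside the supported set.
        supported = ["awq"]
        if self.quantization is None:
            return
        method = self.quantization.lower()
        if method not in supported:
            raise ValueError(
                f"Unknown quantization method: {self.quantization}. "
                f"Must be one of {supported}.")
        self.quantization = method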
@@ -88,7 +89,6 @@ class EngineArgs:
                             'a numpy cache to speed up the loading. '
                             '"dummy" will initialize the weights with random values, '
                             'which is mainly for profiling.')
-        # TODO(woosuk): Support FP32.
         parser.add_argument(
             '--dtype',
             type=str,
@@ -150,6 +150,13 @@
         parser.add_argument('--disable-log-stats',
                             action='store_true',
                             help='disable logging statistics')
+        # Quantization settings.
+        parser.add_argument('--quantization',
+                            '-q',
+                            type=str,
+                            choices=['awq', None],
+                            default=None,
+                            help='Method used to quantize the weights')
         return parser
 
     @classmethod
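
For reference, a sketch of the round trip through the new flag: add_cli_args installs --quantization/-q on an argparse parser, and the classmethod that follows (presumably from_cli_args) rebuilds an EngineArgs from the parsed namespace. The model name below is just the vLLM default, used here for illustration.

import argparse

from vllm.engine.arg_utils import EngineArgs

parser = argparse.ArgumentParser()
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m", "-q", "awq"])
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.quantization)  # -> "awq"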
@@ -163,12 +170,11 @@
     def create_engine_configs(
         self,
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
-                                   self.max_model_len)
+                                   self.max_model_len, self.quantization)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space)
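
A short sketch of how the new argument flows through create_engine_configs into the ModelConfig. The checkpoint name is again a hypothetical AWQ model; note that constructing a ModelConfig fetches the model's Hugging Face config, so the name must be resolvable when this actually runs.

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="TheBloke/Llama-2-7B-AWQ",  # hypothetical
                         quantization="awq")
model_config, cache_config, parallel_config, scheduler_config = (
    engine_args.create_engine_configs())
assert model_config.quantization == "awq"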


@@ -80,6 +80,7 @@ class LLMEngine:
             f"download_dir={model_config.download_dir!r}, "
             f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
+            f"quantization={model_config.quantization}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.