Enable safetensors loading for all models (#974)

Zhuohan Li
2023-09-07 15:49:52 -07:00
committed by GitHub
parent c07ece5ca4
commit c957c741d9
18 changed files with 143 additions and 83 deletions
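The change replaces the per-model use_np_cache boolean with a string load_format argument that is threaded through hf_model_weights_iterator, so weights can be read directly from *.safetensors files. A minimal usage sketch follows, assuming the commit also exposes load_format on the engine arguments with values such as "auto", "pt", "safetensors", "npcache", and "dummy" (only the per-model change is visible in the excerpt below); the model id and sampling settings are illustrative, not part of this diff.

from vllm import LLM, SamplingParams

# Hypothetical example: ask the loader to read *.safetensors weights.
# "auto" would try safetensors first and fall back to the PyTorch .bin
# files; "npcache" keeps the old numpy-cache behavior.
llm = LLM(
    model="Qwen/Qwen-7B",          # illustrative model id
    trust_remote_code=True,
    load_format="safetensors",
)

params = SamplingParams(temperature=0.8, max_tokens=32)
for output in llm.generate(["Hello, my name is"], params):
    print(output.outputs[0].text)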


@@ -19,6 +19,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.weight_utils import (
+    convert_pyslice_to_tensor,
     hf_model_weights_iterator,
     load_padded_tensor_parallel_vocab,
     load_tensor_parallel_weights,
@@ -249,17 +250,19 @@ class QWenLMHeadModel(nn.Module):
         self,
         model_name_or_path: str,
         cache_dir: Optional[str] = None,
-        use_np_cache: bool = False,
+        load_format: str = "auto",
     ):
         tp_world_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
         state_dict = self.state_dict()
         for name, loaded_weight in hf_model_weights_iterator(
-                model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, load_format):
             if "rotary_emb.inv_freq" in name:
                 continue
+            loaded_weight = convert_pyslice_to_tensor(loaded_weight)
             if "c_attn" in name:
                 total_num_heads = self.config.num_attention_heads
                 hidden_size = self.config.hidden_size
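Note on the added convert_pyslice_to_tensor call: with safetensors, hf_model_weights_iterator can yield lazy slice objects instead of materialized tensors, so each weight is converted before the per-weight handling that follows indexes it. A rough sketch of what such a helper might do, assuming the lazy slice supports full-range indexing (this is a sketch, not the committed implementation):

import torch

def convert_pyslice_to_tensor(x) -> torch.Tensor:
    # Sketch only: materialize a lazy safetensors slice (which supports
    # `[:]` indexing) into a regular torch.Tensor; inputs that are already
    # tensors pass through unchanged.
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x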