[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

2024-08-18 17:57:20 -07:00
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,5 +1,6 @@
 """Minimal implementation of CLIPVisionModel intended to be only used 
 within a vision language model."""
+from array import array
 from typing import Iterable, Optional, Tuple

 import torch
@@ -17,7 +18,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal.image import (cached_get_tokenizer,
                                   repeat_and_pad_image_tokens)
-from vllm.sequence import SequenceData
+from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData


 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
@@ -53,8 +54,10 @@ def dummy_seq_data_for_clip(
    else:
        image_feature_size = image_feature_size_override

-    token_ids = [image_token_id] * image_feature_size * num_images
-    token_ids += [0] * (seq_len - image_feature_size * num_images)
+    token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                      [image_token_id]) * image_feature_size * num_images
+    token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                       [0]) * (seq_len - image_feature_size * num_images)
    return SequenceData(token_ids)