[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

This commit is contained in:
SangBin Cho
2024-08-18 17:57:20 -07:00
committed by GitHub
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions

View File

@@ -1,3 +1,4 @@
from array import array
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union)
@@ -17,7 +18,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.opt import OPTModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SamplerOutput, SequenceData)
from .blip import (BlipVisionModel, dummy_image_for_blip,
get_max_blip_image_tokens)
@@ -427,8 +429,10 @@ def dummy_seq_data_for_blip2(
else:
image_feature_size = image_feature_size_override
token_ids = [image_token_id] * image_feature_size * num_images
token_ids += [0] * (seq_len - image_feature_size * num_images)
token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE,
[image_token_id]) * image_feature_size * num_images
token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE,
[0]) * (seq_len - image_feature_size * num_images)
return SequenceData(token_ids)