[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

This commit is contained in:
SangBin Cho
2024-08-18 17:57:20 -07:00
committed by GitHub
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions

View File

@@ -1,3 +1,4 @@
from array import array
from itertools import count
from typing import Callable, Dict, List, Optional
from typing import Sequence as GenericSequence
@@ -9,7 +10,8 @@ import torch
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.utils import set_random_seed
from vllm.sampling_params import SamplingParams
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE,
CompletionSequenceGroupOutput, Logprob,
SamplerOutput, SequenceData, SequenceGroupMetadata,
SequenceOutput)
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
@@ -138,8 +140,9 @@ def create_seq_group_metadata_from_prompts(
seq_data={
i:
SequenceData(
prompt_token_ids=prompt_token_ids[:],
output_token_ids=cont_token_ids[:],
array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids[:]),
_output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE,
cont_token_ids[:]),
),
},
sampling_params=SamplingParams(temperature=0.0, ),