[Bugfix] Fix broadcasting logic for multi_modal_kwargs (#6836)
This commit is contained in:
@@ -56,7 +56,6 @@ steps:
|
|||||||
fast_check: true
|
fast_check: true
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s core
|
- pytest -v -s core
|
||||||
- pytest -v -s distributed/test_parallel_state.py
|
|
||||||
|
|
||||||
- label: Distributed Comm Ops Test
|
- label: Distributed Comm Ops Test
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
@@ -90,13 +89,13 @@ steps:
|
|||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||||
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||||
|
|||||||
@@ -44,6 +44,8 @@ Base Classes
|
|||||||
|
|
||||||
.. autodata:: vllm.multimodal.BatchedTensors
|
.. autodata:: vllm.multimodal.BatchedTensors
|
||||||
|
|
||||||
|
.. autodata:: vllm.multimodal.BatchedTensorInputs
|
||||||
|
|
||||||
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
|
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
|
||||||
:members:
|
:members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|||||||
@@ -19,10 +19,10 @@ from vllm.utils import cuda_device_count_stateless
|
|||||||
|
|
||||||
model = os.environ["TEST_DIST_MODEL"]
|
model = os.environ["TEST_DIST_MODEL"]
|
||||||
|
|
||||||
if model.startswith("llava-hf/llava"):
|
if model.startswith("llava-hf/llava-1.5"):
|
||||||
from ..models.test_llava import models, run_test
|
from ..models.test_llava import models, run_test
|
||||||
elif model.startswith("microsoft/Phi-3-vision"):
|
elif model.startswith("llava-hf/llava-v1.6"):
|
||||||
from ..models.test_phi3v import models, run_test
|
from ..models.test_llava_next import models, run_test
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Unsupported model: {model}")
|
raise NotImplementedError(f"Unsupported model: {model}")
|
||||||
|
|
||||||
@@ -45,7 +45,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
|
|||||||
vllm_runner,
|
vllm_runner,
|
||||||
image_assets,
|
image_assets,
|
||||||
model=models[0],
|
model=models[0],
|
||||||
size_factors=[1.0],
|
# So that the LLaVA-NeXT processor may return a nested list
|
||||||
|
size_factors=[0.25, 0.5, 1.0],
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
num_logprobs=num_logprobs,
|
num_logprobs=num_logprobs,
|
||||||
|
|||||||
@@ -1,57 +0,0 @@
|
|||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm.distributed.parallel_state import (_split_tensor_dict,
|
|
||||||
_update_nested_dict)
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_tensor_dict():
|
|
||||||
test_dict = {
|
|
||||||
"key_a": "a",
|
|
||||||
"key_b": torch.arange(8, dtype=torch.float32),
|
|
||||||
"key_c": {
|
|
||||||
"key_1": torch.arange(5, dtype=torch.float32),
|
|
||||||
"key_2": torch.tensor([], dtype=torch.float32),
|
|
||||||
"key_3": 123,
|
|
||||||
},
|
|
||||||
"key_d": {},
|
|
||||||
}
|
|
||||||
metadata_list, tensor_list = _split_tensor_dict(test_dict)
|
|
||||||
assert len(metadata_list) == 6
|
|
||||||
assert torch.allclose(tensor_list[0], test_dict["key_b"])
|
|
||||||
assert torch.allclose(tensor_list[1], test_dict["key_c"]["key_1"])
|
|
||||||
assert torch.allclose(tensor_list[2], test_dict["key_c"]["key_2"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_split_tensor_dict_invalid_key():
|
|
||||||
test_dict = {
|
|
||||||
"a%b": "a",
|
|
||||||
}
|
|
||||||
with pytest.raises(AssertionError):
|
|
||||||
_split_tensor_dict(test_dict)
|
|
||||||
|
|
||||||
|
|
||||||
def test_update_nested_dict():
|
|
||||||
flattened_keys_values = [("key1%key2%key3", "value1"),
|
|
||||||
("key1%key2%key4", "value2"),
|
|
||||||
("key1%key5", "value3"), ("key6%key7", "value4"),
|
|
||||||
("key8", "value5")]
|
|
||||||
res: Dict[str, Any] = {}
|
|
||||||
|
|
||||||
for flat_key, value in flattened_keys_values:
|
|
||||||
_update_nested_dict(res, flat_key, value)
|
|
||||||
assert res == {
|
|
||||||
"key1": {
|
|
||||||
"key2": {
|
|
||||||
"key3": "value1",
|
|
||||||
"key4": "value2"
|
|
||||||
},
|
|
||||||
"key5": "value3"
|
|
||||||
},
|
|
||||||
"key6": {
|
|
||||||
"key7": "value4"
|
|
||||||
},
|
|
||||||
"key8": "value5"
|
|
||||||
}
|
|
||||||
@@ -1,14 +1,12 @@
|
|||||||
from typing import List, Optional, Tuple
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoConfig, AutoTokenizer
|
from transformers import AutoConfig, AutoTokenizer
|
||||||
|
|
||||||
from vllm.model_executor.models.llava_next import (
|
|
||||||
get_llava_next_image_feature_size)
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.utils import rescale_image_size
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
from ..conftest import IMAGE_ASSETS
|
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||||
from .utils import check_logprobs_close
|
from .utils import check_logprobs_close
|
||||||
|
|
||||||
pytestmark = pytest.mark.vlm
|
pytestmark = pytest.mark.vlm
|
||||||
@@ -27,6 +25,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
|||||||
|
|
||||||
IMAGE_TOKEN_ID = 32000
|
IMAGE_TOKEN_ID = 32000
|
||||||
|
|
||||||
|
models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
|
||||||
|
|
||||||
|
|
||||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||||
Optional[SampleLogprobs]],
|
Optional[SampleLogprobs]],
|
||||||
@@ -50,34 +50,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
|||||||
return hf_output_ids, hf_output_str, out_logprobs
|
return hf_output_ids, hf_output_str, out_logprobs
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
|
def run_test(
|
||||||
@pytest.mark.parametrize(
|
hf_runner: Type[HfRunner],
|
||||||
"size_factors",
|
vllm_runner: Type[VllmRunner],
|
||||||
[
|
image_assets: _ImageAssets,
|
||||||
# No image
|
model: str,
|
||||||
[],
|
*,
|
||||||
# Single-scale
|
size_factors: List[float],
|
||||||
[1.0],
|
dtype: str,
|
||||||
# Single-scale, batched
|
max_tokens: int,
|
||||||
[1.0, 1.0, 1.0],
|
num_logprobs: int,
|
||||||
# Multi-scale
|
tensor_parallel_size: int,
|
||||||
[0.25, 0.5, 1.0],
|
distributed_executor_backend: Optional[str] = None,
|
||||||
],
|
):
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("dtype", ["half"])
|
|
||||||
@pytest.mark.parametrize("max_tokens", [128])
|
|
||||||
@pytest.mark.parametrize("num_logprobs", [5])
|
|
||||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|
||||||
dtype, max_tokens, num_logprobs) -> None:
|
|
||||||
"""Inference result should be the same between hf and vllm.
|
|
||||||
|
|
||||||
All the image fixtures for the test are under tests/images.
|
|
||||||
For huggingface runner, we provide the PIL images as input.
|
|
||||||
For vllm runner, we provide MultiModalDataDict objects
|
|
||||||
and corresponding vision language config as input.
|
|
||||||
Note, the text input is also adjusted to abide by vllm contract.
|
|
||||||
The text output is sanitized to be able to compare with hf.
|
|
||||||
"""
|
|
||||||
images = [asset.pil_image for asset in image_assets]
|
images = [asset.pil_image for asset in image_assets]
|
||||||
|
|
||||||
inputs_per_image = [(
|
inputs_per_image = [(
|
||||||
@@ -89,6 +74,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|||||||
with vllm_runner(model,
|
with vllm_runner(model,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True) as vllm_model:
|
enforce_eager=True) as vllm_model:
|
||||||
vllm_outputs_per_image = [
|
vllm_outputs_per_image = [
|
||||||
vllm_model.generate_greedy_logprobs(prompts,
|
vllm_model.generate_greedy_logprobs(prompts,
|
||||||
@@ -122,9 +109,54 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("model", models)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"size_factors",
|
||||||
|
[
|
||||||
|
# No image
|
||||||
|
[],
|
||||||
|
# Single-scale
|
||||||
|
[1.0],
|
||||||
|
# Single-scale, batched
|
||||||
|
[1.0, 1.0, 1.0],
|
||||||
|
# Multi-scale
|
||||||
|
[0.25, 0.5, 1.0],
|
||||||
|
],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
|
@pytest.mark.parametrize("max_tokens", [128])
|
||||||
|
@pytest.mark.parametrize("num_logprobs", [5])
|
||||||
|
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||||
|
dtype, max_tokens, num_logprobs) -> None:
|
||||||
|
"""Inference result should be the same between hf and vllm.
|
||||||
|
|
||||||
|
All the image fixtures for the test are under tests/images.
|
||||||
|
For huggingface runner, we provide the PIL images as input.
|
||||||
|
For vllm runner, we provide MultiModalDataDict objects
|
||||||
|
and corresponding vision language config as input.
|
||||||
|
Note, the text input is also adjusted to abide by vllm contract.
|
||||||
|
The text output is sanitized to be able to compare with hf.
|
||||||
|
"""
|
||||||
|
run_test(
|
||||||
|
hf_runner,
|
||||||
|
vllm_runner,
|
||||||
|
image_assets,
|
||||||
|
model,
|
||||||
|
size_factors=size_factors,
|
||||||
|
dtype=dtype,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
num_logprobs=num_logprobs,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
|
@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
|
||||||
(183, 488, 776)])
|
(183, 488, 776)])
|
||||||
def test_image_feature_size(height_and_width_and_result):
|
def test_image_feature_size(height_and_width_and_result):
|
||||||
|
# Avoid initializing CUDA too early in distributed tests
|
||||||
|
from vllm.model_executor.models.llava_next import (
|
||||||
|
get_llava_next_image_feature_size)
|
||||||
|
|
||||||
height, width, result = height_and_width_and_result
|
height, width, result = height_and_width_and_result
|
||||||
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
||||||
assert get_llava_next_image_feature_size(config,
|
assert get_llava_next_image_feature_size(config,
|
||||||
|
|||||||
@@ -45,22 +45,16 @@ TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
|
|||||||
|
|
||||||
|
|
||||||
def _split_tensor_dict(
|
def _split_tensor_dict(
|
||||||
tensor_dict: Dict[str, Union[torch.Tensor, Any]],
|
tensor_dict: Dict[str, Union[torch.Tensor, Any]]
|
||||||
prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
|
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
|
||||||
"""Split the tensor dictionary into two parts:
|
"""Split the tensor dictionary into two parts:
|
||||||
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
|
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
|
||||||
by its metadata.
|
by its metadata.
|
||||||
2. A list of tensors.
|
2. A list of tensors.
|
||||||
|
|
||||||
If the Tensor is nested under `tensor_dict["key1"]["key2"]`, the key of its
|
|
||||||
metadata will be "key1%key2".
|
|
||||||
"""
|
"""
|
||||||
metadata_list: List[Tuple[str, Any]] = []
|
metadata_list: List[Tuple[str, Any]] = []
|
||||||
tensor_list = []
|
tensor_list: List[torch.Tensor] = []
|
||||||
for key, value in tensor_dict.items():
|
for key, value in tensor_dict.items():
|
||||||
assert "%" not in key, (
|
|
||||||
"Avoid having '%' in key "
|
|
||||||
"as it is used as a separator for nested entries.")
|
|
||||||
if isinstance(value, torch.Tensor):
|
if isinstance(value, torch.Tensor):
|
||||||
# Note: we cannot use `value.device` here,
|
# Note: we cannot use `value.device` here,
|
||||||
# because it contains not only the device type but also the device
|
# because it contains not only the device type but also the device
|
||||||
@@ -68,31 +62,13 @@ def _split_tensor_dict(
|
|||||||
# receiving side will set the device index.
|
# receiving side will set the device index.
|
||||||
device = value.device.type
|
device = value.device.type
|
||||||
metadata_list.append(
|
metadata_list.append(
|
||||||
(prefix + key, TensorMetadata(device, value.dtype,
|
(key, TensorMetadata(device, value.dtype, value.size())))
|
||||||
value.size())))
|
|
||||||
tensor_list.append(value)
|
tensor_list.append(value)
|
||||||
elif isinstance(value, dict):
|
|
||||||
if len(value) == 0:
|
|
||||||
metadata_list.append((prefix + key, value))
|
|
||||||
inner_metadata_list, inner_tensor_list = _split_tensor_dict(
|
|
||||||
value, prefix + key + "%")
|
|
||||||
metadata_list.extend(inner_metadata_list)
|
|
||||||
tensor_list.extend(inner_tensor_list)
|
|
||||||
else:
|
else:
|
||||||
metadata_list.append((prefix + key, value))
|
metadata_list.append((key, value))
|
||||||
return metadata_list, tensor_list
|
return metadata_list, tensor_list
|
||||||
|
|
||||||
|
|
||||||
def _update_nested_dict(nested_dict, flattened_key, value):
|
|
||||||
key_splits = flattened_key.split("%")
|
|
||||||
cur_dict = nested_dict
|
|
||||||
for k in key_splits[:-1]:
|
|
||||||
if k not in cur_dict:
|
|
||||||
cur_dict[k] = {}
|
|
||||||
cur_dict = cur_dict[k]
|
|
||||||
cur_dict[key_splits[-1]] = value
|
|
||||||
|
|
||||||
|
|
||||||
class GroupCoordinator:
|
class GroupCoordinator:
|
||||||
"""
|
"""
|
||||||
PyTorch ProcessGroup wrapper for a group of processes.
|
PyTorch ProcessGroup wrapper for a group of processes.
|
||||||
@@ -566,7 +542,7 @@ class GroupCoordinator:
|
|||||||
device=value.device)
|
device=value.device)
|
||||||
if tensor.numel() == 0:
|
if tensor.numel() == 0:
|
||||||
# Skip broadcasting empty tensors.
|
# Skip broadcasting empty tensors.
|
||||||
_update_nested_dict(tensor_dict, key, tensor)
|
tensor_dict[key] = tensor
|
||||||
continue
|
continue
|
||||||
if tensor.is_cpu:
|
if tensor.is_cpu:
|
||||||
# use metadata_group for CPU tensors
|
# use metadata_group for CPU tensors
|
||||||
@@ -583,9 +559,9 @@ class GroupCoordinator:
|
|||||||
group=group,
|
group=group,
|
||||||
async_op=True)
|
async_op=True)
|
||||||
async_handles.append(handle)
|
async_handles.append(handle)
|
||||||
_update_nested_dict(tensor_dict, key, tensor)
|
tensor_dict[key] = tensor
|
||||||
else:
|
else:
|
||||||
_update_nested_dict(tensor_dict, key, value)
|
tensor_dict[key] = value
|
||||||
for async_handle in async_handles:
|
for async_handle in async_handles:
|
||||||
async_handle.wait()
|
async_handle.wait()
|
||||||
return tensor_dict
|
return tensor_dict
|
||||||
@@ -661,7 +637,7 @@ class GroupCoordinator:
|
|||||||
device=value.device)
|
device=value.device)
|
||||||
if tensor.numel() == 0:
|
if tensor.numel() == 0:
|
||||||
# Skip broadcasting empty tensors.
|
# Skip broadcasting empty tensors.
|
||||||
_update_nested_dict(tensor_dict, key, tensor)
|
tensor_dict[key] = tensor
|
||||||
continue
|
continue
|
||||||
if tensor.is_cpu:
|
if tensor.is_cpu:
|
||||||
# use metadata_group for CPU tensors
|
# use metadata_group for CPU tensors
|
||||||
@@ -673,9 +649,9 @@ class GroupCoordinator:
|
|||||||
torch.distributed.recv(tensor,
|
torch.distributed.recv(tensor,
|
||||||
src=self.ranks[src],
|
src=self.ranks[src],
|
||||||
group=group)
|
group=group)
|
||||||
_update_nested_dict(tensor_dict, key, tensor)
|
tensor_dict[key] = tensor
|
||||||
else:
|
else:
|
||||||
_update_nested_dict(tensor_dict, key, value)
|
tensor_dict[key] = value
|
||||||
return tensor_dict
|
return tensor_dict
|
||||||
|
|
||||||
def barrier(self):
|
def barrier(self):
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
|
from .base import (BatchedTensorInputs, BatchedTensors, MultiModalDataBuiltins,
|
||||||
MultiModalInputs, MultiModalPlugin, NestedTensors)
|
MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
|
||||||
|
NestedTensors)
|
||||||
from .registry import MultiModalRegistry
|
from .registry import MultiModalRegistry
|
||||||
|
|
||||||
MULTIMODAL_REGISTRY = MultiModalRegistry()
|
MULTIMODAL_REGISTRY = MultiModalRegistry()
|
||||||
@@ -12,6 +13,7 @@ See also:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"BatchedTensorInputs",
|
||||||
"BatchedTensors",
|
"BatchedTensors",
|
||||||
"MultiModalDataBuiltins",
|
"MultiModalDataBuiltins",
|
||||||
"MultiModalDataDict",
|
"MultiModalDataDict",
|
||||||
|
|||||||
@@ -9,10 +9,12 @@ import torch
|
|||||||
import torch.types
|
import torch.types
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
from typing_extensions import TypeAlias
|
||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.inputs import InputContext
|
from vllm.inputs import InputContext
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.utils import JSONTree, json_map_leaves
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@@ -22,11 +24,16 @@ Use a list instead of a tensor if the dimensions of each element do not match.
|
|||||||
Currently only supports up to a singly nested list of tensors.
|
Currently only supports up to a singly nested list of tensors.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BatchedTensors = Union[GenericSequence[NestedTensors], NestedTensors]
|
BatchedTensors: TypeAlias = JSONTree[torch.Tensor]
|
||||||
"""
|
"""
|
||||||
If each input tensor in the batch has the same size, this is a single batched
|
A nested JSON structure of tensors which have been batched via
|
||||||
tensor; otherwise, this is a list of :class:`NestedTensors` with one element
|
:meth:`MultiModalInputs.batch`.
|
||||||
per item in the batch.
|
"""
|
||||||
|
|
||||||
|
BatchedTensorInputs: TypeAlias = Dict[str, JSONTree[torch.Tensor]]
|
||||||
|
"""
|
||||||
|
A dictionary containing nested tensors which have been batched via
|
||||||
|
:meth:`MultiModalInputs.batch`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if sys.version_info < (3, 9):
|
if sys.version_info < (3, 9):
|
||||||
@@ -46,14 +53,17 @@ class MultiModalInputs(_MultiModalInputsBase):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def try_concat(
|
def _try_concat(
|
||||||
tensors: List[NestedTensors],
|
tensors: List[NestedTensors],
|
||||||
*,
|
) -> Union[GenericSequence[NestedTensors], NestedTensors]:
|
||||||
device: torch.types.Device,
|
"""
|
||||||
) -> BatchedTensors:
|
If each input tensor in the batch has the same shape, return a single
|
||||||
|
batched tensor; otherwise, return a list of :class:`NestedTensors` with
|
||||||
|
one element per item in the batch.
|
||||||
|
"""
|
||||||
# may be list rather than tensors
|
# may be list rather than tensors
|
||||||
if isinstance(tensors[0], list):
|
if isinstance(tensors[0], list):
|
||||||
return [[t.to(device=device) for t in tensor[0]]
|
return [[t for t in tensor[0]]
|
||||||
for tensor in cast(List[List[torch.Tensor]], tensors)]
|
for tensor in cast(List[List[torch.Tensor]], tensors)]
|
||||||
|
|
||||||
tensors_ = cast(List[torch.Tensor], tensors)
|
tensors_ = cast(List[torch.Tensor], tensors)
|
||||||
@@ -62,18 +72,21 @@ class MultiModalInputs(_MultiModalInputsBase):
|
|||||||
|
|
||||||
for tensor in tensors_:
|
for tensor in tensors_:
|
||||||
if tensor.shape[1:] != unbatched_shape:
|
if tensor.shape[1:] != unbatched_shape:
|
||||||
return [
|
return [tensor.squeeze(0) for tensor in tensors_]
|
||||||
tensor.squeeze(0).to(device=device) for tensor in tensors_
|
|
||||||
]
|
|
||||||
|
|
||||||
return torch.cat(tensors_, dim=0).to(device=device)
|
return torch.cat(tensors_, dim=0)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def batch(
|
def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs:
|
||||||
inputs_list: List["MultiModalInputs"],
|
"""
|
||||||
device: torch.types.Device,
|
Batch multiple inputs together into a dictionary.
|
||||||
) -> Dict[str, BatchedTensors]:
|
|
||||||
"""Batch multiple inputs together into a dictionary."""
|
The resulting dictionary has the same keys as the inputs.
|
||||||
|
If the corresponding value from each input is a tensor and they all
|
||||||
|
share the same shape, the output value is a single batched tensor;
|
||||||
|
otherwise, the output value is a list containing the original value
|
||||||
|
from each input.
|
||||||
|
"""
|
||||||
if len(inputs_list) == 0:
|
if len(inputs_list) == 0:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
@@ -90,9 +103,18 @@ class MultiModalInputs(_MultiModalInputsBase):
|
|||||||
item_lists[k].append(v)
|
item_lists[k].append(v)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
k: MultiModalInputs.try_concat(item_list, device=device)
|
k: MultiModalInputs._try_concat(item_list)
|
||||||
for k, item_list in item_lists.items()
|
for k, item_list in item_lists.items()
|
||||||
}
|
} # type: ignore
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def as_kwargs(
|
||||||
|
batched_inputs: BatchedTensorInputs,
|
||||||
|
*,
|
||||||
|
device: torch.types.Device,
|
||||||
|
) -> BatchedTensorInputs:
|
||||||
|
return json_map_leaves(lambda x: x.to(device, non_blocking=True),
|
||||||
|
batched_inputs)
|
||||||
|
|
||||||
|
|
||||||
class MultiModalDataBuiltins(TypedDict, total=False):
|
class MultiModalDataBuiltins(TypedDict, total=False):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
|||||||
ModelConfig, MultiModalConfig, ParallelConfig,
|
ModelConfig, MultiModalConfig, ParallelConfig,
|
||||||
PromptAdapterConfig, SchedulerConfig)
|
PromptAdapterConfig, SchedulerConfig)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
from vllm.multimodal import MultiModalInputs
|
||||||
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
|
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
|
||||||
SamplerOutput)
|
SamplerOutput)
|
||||||
from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
|
from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
|
||||||
@@ -323,7 +324,8 @@ class TP1DraftModelRunner(ModelRunner):
|
|||||||
kv_caches=kv_caches,
|
kv_caches=kv_caches,
|
||||||
attn_metadata=model_input.attn_metadata,
|
attn_metadata=model_input.attn_metadata,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
**multi_modal_kwargs,
|
**MultiModalInputs.as_kwargs(multi_modal_kwargs,
|
||||||
|
device=self.device),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compute the logits.
|
# Compute the logits.
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from functools import lru_cache, partial, wraps
|
|||||||
from platform import uname
|
from platform import uname
|
||||||
from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
|
from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
|
||||||
Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar,
|
Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar,
|
||||||
Union)
|
Union, overload)
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
@@ -53,6 +53,7 @@ TORCH_DTYPE_TO_NUMPY_DTYPE = {
|
|||||||
P = ParamSpec('P')
|
P = ParamSpec('P')
|
||||||
K = TypeVar("K")
|
K = TypeVar("K")
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
|
U = TypeVar("U")
|
||||||
|
|
||||||
|
|
||||||
class _Sentinel:
|
class _Sentinel:
|
||||||
@@ -712,6 +713,54 @@ def merge_dicts(dict1: Dict[K, List[T]],
|
|||||||
return dict(merged_dict)
|
return dict(merged_dict)
|
||||||
|
|
||||||
|
|
||||||
|
JSONTree = Union[Dict[str, "JSONTree[T]"], List["JSONTree[T]"],
|
||||||
|
Tuple["JSONTree[T]", ...], T]
|
||||||
|
"""A nested JSON structure where the leaves need not be JSON-serializable."""
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def json_map_leaves(
|
||||||
|
func: Callable[[T], U],
|
||||||
|
value: Dict[str, JSONTree[T]],
|
||||||
|
) -> Dict[str, JSONTree[U]]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def json_map_leaves(
|
||||||
|
func: Callable[[T], U],
|
||||||
|
value: List[JSONTree[T]],
|
||||||
|
) -> List[JSONTree[U]]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def json_map_leaves(
|
||||||
|
func: Callable[[T], U],
|
||||||
|
value: Tuple[JSONTree[T], ...],
|
||||||
|
) -> Tuple[JSONTree[U], ...]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@overload
|
||||||
|
def json_map_leaves(
|
||||||
|
func: Callable[[T], U],
|
||||||
|
value: JSONTree[T],
|
||||||
|
) -> JSONTree[U]:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]:
|
||||||
|
if isinstance(value, dict):
|
||||||
|
return {k: json_map_leaves(func, v) for k, v in value.items()}
|
||||||
|
elif isinstance(value, list):
|
||||||
|
return [json_map_leaves(func, v) for v in value]
|
||||||
|
elif isinstance(value, tuple):
|
||||||
|
return tuple(json_map_leaves(func, v) for v in value)
|
||||||
|
else:
|
||||||
|
return func(value)
|
||||||
|
|
||||||
|
|
||||||
def flatten_2d_lists(lists: List[List[T]]) -> List[T]:
|
def flatten_2d_lists(lists: List[List[T]]) -> List[T]:
|
||||||
"""Flatten a list of lists to a single list."""
|
"""Flatten a list of lists to a single list."""
|
||||||
return [item for sublist in lists for item in sublist]
|
return [item for sublist in lists for item in sublist]
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
|
||||||
Type, Union)
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
@@ -12,7 +11,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor import SamplingMetadata
|
from vllm.model_executor import SamplingMetadata
|
||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs)
|
||||||
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
||||||
SequenceGroupMetadata)
|
SequenceGroupMetadata)
|
||||||
@@ -41,7 +40,7 @@ class CPUModelInput(ModelRunnerInputBase):
|
|||||||
input_positions: Optional[torch.Tensor] = None
|
input_positions: Optional[torch.Tensor] = None
|
||||||
attn_metadata: Optional["AttentionMetadata"] = None
|
attn_metadata: Optional["AttentionMetadata"] = None
|
||||||
sampling_metadata: Optional["SamplingMetadata"] = None
|
sampling_metadata: Optional["SamplingMetadata"] = None
|
||||||
multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
|
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
|
||||||
virtual_engine: Optional[int] = None
|
virtual_engine: Optional[int] = None
|
||||||
|
|
||||||
def as_broadcastable_tensor_dict(
|
def as_broadcastable_tensor_dict(
|
||||||
@@ -136,7 +135,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
|
|||||||
self,
|
self,
|
||||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
|
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
|
||||||
Mapping[str, BatchedTensors]]:
|
BatchedTensorInputs]:
|
||||||
assert len(seq_group_metadata_list) > 0
|
assert len(seq_group_metadata_list) > 0
|
||||||
input_tokens: List[int] = []
|
input_tokens: List[int] = []
|
||||||
input_positions: List[int] = []
|
input_positions: List[int] = []
|
||||||
@@ -214,8 +213,7 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
|
|||||||
slot_mapping=slot_mapping,
|
slot_mapping=slot_mapping,
|
||||||
)
|
)
|
||||||
|
|
||||||
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
|
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
|
||||||
device=self.device)
|
|
||||||
|
|
||||||
return (input_tokens, input_positions, attn_metadata, seq_lens,
|
return (input_tokens, input_positions, attn_metadata, seq_lens,
|
||||||
multi_modal_kwargs)
|
multi_modal_kwargs)
|
||||||
@@ -361,11 +359,16 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
|
|||||||
|
|
||||||
model_executable = self.model
|
model_executable = self.model
|
||||||
execute_model_kwargs = {
|
execute_model_kwargs = {
|
||||||
"input_ids": model_input.input_tokens,
|
"input_ids":
|
||||||
"positions": model_input.input_positions,
|
model_input.input_tokens,
|
||||||
"kv_caches": kv_caches,
|
"positions":
|
||||||
"attn_metadata": model_input.attn_metadata,
|
model_input.input_positions,
|
||||||
**(model_input.multi_modal_kwargs or {}),
|
"kv_caches":
|
||||||
|
kv_caches,
|
||||||
|
"attn_metadata":
|
||||||
|
model_input.attn_metadata,
|
||||||
|
**MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
|
||||||
|
device=self.device),
|
||||||
}
|
}
|
||||||
|
|
||||||
hidden_states = model_executable(**execute_model_kwargs)
|
hidden_states = model_executable(**execute_model_kwargs)
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
|||||||
PromptAdapterConfig, SchedulerConfig)
|
PromptAdapterConfig, SchedulerConfig)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.pooling_metadata import PoolingMetadata
|
from vllm.model_executor.pooling_metadata import PoolingMetadata
|
||||||
|
from vllm.multimodal import MultiModalInputs
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
|
from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
|
||||||
SequenceGroupMetadata)
|
SequenceGroupMetadata)
|
||||||
@@ -99,11 +100,16 @@ class EmbeddingModelRunner(
|
|||||||
kv_caches = [None] * num_layers
|
kv_caches = [None] * num_layers
|
||||||
|
|
||||||
execute_model_kwargs = {
|
execute_model_kwargs = {
|
||||||
"input_ids": model_input.input_tokens,
|
"input_ids":
|
||||||
"positions": model_input.input_positions,
|
model_input.input_tokens,
|
||||||
"kv_caches": kv_caches,
|
"positions":
|
||||||
"attn_metadata": model_input.attn_metadata,
|
model_input.input_positions,
|
||||||
**(model_input.multi_modal_kwargs or {}),
|
"kv_caches":
|
||||||
|
kv_caches,
|
||||||
|
"attn_metadata":
|
||||||
|
model_input.attn_metadata,
|
||||||
|
**MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
|
||||||
|
device=self.device),
|
||||||
}
|
}
|
||||||
|
|
||||||
hidden_states = model_executable(**execute_model_kwargs)
|
hidden_states = model_executable(**execute_model_kwargs)
|
||||||
|
|||||||
@@ -4,8 +4,8 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
import weakref
|
import weakref
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set,
|
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type,
|
||||||
Tuple, Type, TypeVar, Union)
|
TypeVar, Union)
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@@ -40,7 +40,7 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
|||||||
from vllm.model_executor.models.interfaces import (supports_lora,
|
from vllm.model_executor.models.interfaces import (supports_lora,
|
||||||
supports_vision)
|
supports_vision)
|
||||||
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
|
from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs)
|
||||||
from vllm.prompt_adapter.layers import PromptAdapterMapping
|
from vllm.prompt_adapter.layers import PromptAdapterMapping
|
||||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||||
@@ -94,7 +94,7 @@ class ModelInputForGPU(ModelRunnerInputBase):
|
|||||||
attn_metadata: Optional["AttentionMetadata"] = None
|
attn_metadata: Optional["AttentionMetadata"] = None
|
||||||
prompt_adapter_mapping: Optional[PromptAdapterMapping] = None
|
prompt_adapter_mapping: Optional[PromptAdapterMapping] = None
|
||||||
prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None
|
prompt_adapter_requests: Optional[Set[PromptAdapterRequest]] = None
|
||||||
multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
|
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
|
||||||
request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
|
request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
|
||||||
finished_requests_ids: Optional[List[str]] = None
|
finished_requests_ids: Optional[List[str]] = None
|
||||||
virtual_engine: int = 0
|
virtual_engine: int = 0
|
||||||
@@ -608,8 +608,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
|
|||||||
data.multi_modal_inputs for data in self.inter_data_list
|
data.multi_modal_inputs for data in self.inter_data_list
|
||||||
if data.multi_modal_inputs is not None
|
if data.multi_modal_inputs is not None
|
||||||
]
|
]
|
||||||
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
|
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
|
||||||
device=self.runner.device)
|
|
||||||
|
|
||||||
return self.model_input_cls(
|
return self.model_input_cls(
|
||||||
input_tokens=input_tokens_tensor,
|
input_tokens=input_tokens_tensor,
|
||||||
@@ -1361,7 +1360,8 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
|
|||||||
kv_caches=kv_caches,
|
kv_caches=kv_caches,
|
||||||
attn_metadata=model_input.attn_metadata,
|
attn_metadata=model_input.attn_metadata,
|
||||||
intermediate_tensors=intermediate_tensors,
|
intermediate_tensors=intermediate_tensors,
|
||||||
**multi_modal_kwargs,
|
**MultiModalInputs.as_kwargs(multi_modal_kwargs,
|
||||||
|
device=self.device),
|
||||||
**seqlen_agnostic_kwargs)
|
**seqlen_agnostic_kwargs)
|
||||||
|
|
||||||
# Compute the logits in the last pipeline stage.
|
# Compute the logits in the last pipeline stage.
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||||
Union)
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
@@ -10,7 +9,7 @@ from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor import SamplingMetadata
|
from vllm.model_executor import SamplingMetadata
|
||||||
from vllm.model_executor.model_loader.neuron import get_neuron_model
|
from vllm.model_executor.model_loader.neuron import get_neuron_model
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs)
|
||||||
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
||||||
SequenceGroupMetadata)
|
SequenceGroupMetadata)
|
||||||
@@ -32,7 +31,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
|
|||||||
input_positions: Optional[torch.Tensor] = None
|
input_positions: Optional[torch.Tensor] = None
|
||||||
input_block_ids: Optional[torch.Tensor] = None
|
input_block_ids: Optional[torch.Tensor] = None
|
||||||
sampling_metadata: Optional["SamplingMetadata"] = None
|
sampling_metadata: Optional["SamplingMetadata"] = None
|
||||||
multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
|
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
|
||||||
|
|
||||||
def as_broadcastable_tensor_dict(
|
def as_broadcastable_tensor_dict(
|
||||||
self) -> Dict[str, Union[int, torch.Tensor]]:
|
self) -> Dict[str, Union[int, torch.Tensor]]:
|
||||||
@@ -84,8 +83,8 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
|
|||||||
def _prepare_prompt(
|
def _prepare_prompt(
|
||||||
self,
|
self,
|
||||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], Mapping[
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int],
|
||||||
str, BatchedTensors]]:
|
BatchedTensorInputs]:
|
||||||
assert len(seq_group_metadata_list) > 0
|
assert len(seq_group_metadata_list) > 0
|
||||||
input_tokens: List[List[int]] = []
|
input_tokens: List[List[int]] = []
|
||||||
input_positions: List[List[int]] = []
|
input_positions: List[List[int]] = []
|
||||||
@@ -134,8 +133,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
|
|||||||
dtype=torch.long,
|
dtype=torch.long,
|
||||||
device=self.device)
|
device=self.device)
|
||||||
|
|
||||||
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
|
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
|
||||||
device=self.device)
|
|
||||||
|
|
||||||
return (input_tokens, input_positions, input_block_ids, seq_lens,
|
return (input_tokens, input_positions, input_block_ids, seq_lens,
|
||||||
multi_modal_kwargs)
|
multi_modal_kwargs)
|
||||||
@@ -244,7 +242,8 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
|
|||||||
input_ids=model_input.input_tokens,
|
input_ids=model_input.input_tokens,
|
||||||
positions=model_input.input_positions,
|
positions=model_input.input_positions,
|
||||||
input_block_ids=model_input.input_block_ids,
|
input_block_ids=model_input.input_block_ids,
|
||||||
**(model_input.multi_modal_kwargs or {}),
|
**MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
|
||||||
|
device=self.device),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compute the logits.
|
# Compute the logits.
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Mapping, NamedTuple, Optional, Tuple
|
from typing import List, NamedTuple, Optional, Tuple
|
||||||
|
|
||||||
import openvino as ov
|
import openvino as ov
|
||||||
import torch
|
import torch
|
||||||
@@ -12,7 +12,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor import SamplingMetadata
|
from vllm.model_executor import SamplingMetadata
|
||||||
from vllm.model_executor.model_loader.openvino import get_model
|
from vllm.model_executor.model_loader.openvino import get_model
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs)
|
||||||
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@ class ModelInput(NamedTuple):
|
|||||||
attn_metadata: Optional[OpenVINOAttentionMetadata]
|
attn_metadata: Optional[OpenVINOAttentionMetadata]
|
||||||
seq_lens: List[int]
|
seq_lens: List[int]
|
||||||
query_lens: List[int]
|
query_lens: List[int]
|
||||||
multi_modal_kwargs: Mapping[str, BatchedTensors]
|
multi_modal_kwargs: BatchedTensorInputs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def empty(cls, device):
|
def empty(cls, device):
|
||||||
@@ -265,8 +265,7 @@ class OpenVINOModelRunner:
|
|||||||
max_context_len=max_context_len_tensor,
|
max_context_len=max_context_len_tensor,
|
||||||
)
|
)
|
||||||
|
|
||||||
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
|
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
|
||||||
device=self.device)
|
|
||||||
|
|
||||||
return ModelInput(
|
return ModelInput(
|
||||||
input_tokens,
|
input_tokens,
|
||||||
@@ -281,7 +280,7 @@ class OpenVINOModelRunner:
|
|||||||
self,
|
self,
|
||||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
|
) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
|
||||||
SamplingMetadata, Mapping[str, BatchedTensors]]:
|
SamplingMetadata, BatchedTensorInputs]:
|
||||||
# Prepare input tensors.
|
# Prepare input tensors.
|
||||||
(
|
(
|
||||||
input_tokens,
|
input_tokens,
|
||||||
@@ -324,11 +323,16 @@ class OpenVINOModelRunner:
|
|||||||
|
|
||||||
model_executable = self.model
|
model_executable = self.model
|
||||||
execute_model_kwargs = {
|
execute_model_kwargs = {
|
||||||
"input_ids": input_tokens,
|
"input_ids":
|
||||||
"positions": input_positions,
|
input_tokens,
|
||||||
"kv_caches": kv_caches,
|
"positions":
|
||||||
"attn_metadata": attn_metadata,
|
input_positions,
|
||||||
**(multi_modal_kwargs or {}),
|
"kv_caches":
|
||||||
|
kv_caches,
|
||||||
|
"attn_metadata":
|
||||||
|
attn_metadata,
|
||||||
|
**MultiModalInputs.as_kwargs(multi_modal_kwargs or {},
|
||||||
|
device=self.device),
|
||||||
}
|
}
|
||||||
|
|
||||||
hidden_states = model_executable(**execute_model_kwargs)
|
hidden_states = model_executable(**execute_model_kwargs)
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
|
||||||
Type, Union)
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@@ -14,7 +13,7 @@ from vllm.inputs import INPUT_REGISTRY
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.model_executor.models.interfaces import supports_vision
|
from vllm.model_executor.models.interfaces import supports_vision
|
||||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
|
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs)
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
from vllm.sequence import (IntermediateTensors, SamplerOutput,
|
||||||
@@ -49,7 +48,7 @@ class ModelInputForXPU(ModelRunnerInputBase):
|
|||||||
input_positions: Optional[torch.Tensor] = None
|
input_positions: Optional[torch.Tensor] = None
|
||||||
attn_metadata: Optional["AttentionMetadata"] = None
|
attn_metadata: Optional["AttentionMetadata"] = None
|
||||||
sampling_metadata: Optional["SamplingMetadata"] = None
|
sampling_metadata: Optional["SamplingMetadata"] = None
|
||||||
multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
|
multi_modal_kwargs: Optional[BatchedTensorInputs] = None
|
||||||
|
|
||||||
def as_broadcastable_tensor_dict(
|
def as_broadcastable_tensor_dict(
|
||||||
self) -> Dict[str, Union[int, torch.Tensor]]:
|
self) -> Dict[str, Union[int, torch.Tensor]]:
|
||||||
@@ -376,11 +375,16 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
|
|||||||
|
|
||||||
model_executable = self.model
|
model_executable = self.model
|
||||||
execute_model_kwargs = {
|
execute_model_kwargs = {
|
||||||
"input_ids": model_input.input_tokens,
|
"input_ids":
|
||||||
"positions": model_input.input_positions,
|
model_input.input_tokens,
|
||||||
"kv_caches": kv_caches,
|
"positions":
|
||||||
"attn_metadata": model_input.attn_metadata,
|
model_input.input_positions,
|
||||||
**(model_input.multi_modal_kwargs or {}),
|
"kv_caches":
|
||||||
|
kv_caches,
|
||||||
|
"attn_metadata":
|
||||||
|
model_input.attn_metadata,
|
||||||
|
**MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {},
|
||||||
|
device=self.device),
|
||||||
}
|
}
|
||||||
|
|
||||||
hidden_states = model_executable(**execute_model_kwargs)
|
hidden_states = model_executable(**execute_model_kwargs)
|
||||||
@@ -404,7 +408,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
|
|||||||
self,
|
self,
|
||||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
|
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
|
||||||
Mapping[str, BatchedTensors]]:
|
BatchedTensorInputs]:
|
||||||
assert len(seq_group_metadata_list) > 0
|
assert len(seq_group_metadata_list) > 0
|
||||||
input_tokens: List[int] = []
|
input_tokens: List[int] = []
|
||||||
input_positions: List[int] = []
|
input_positions: List[int] = []
|
||||||
@@ -496,8 +500,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
|
|||||||
block_tables=torch.tensor([], device=self.device, dtype=torch.int),
|
block_tables=torch.tensor([], device=self.device, dtype=torch.int),
|
||||||
)
|
)
|
||||||
|
|
||||||
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
|
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
|
||||||
device=self.device)
|
|
||||||
|
|
||||||
return (input_tokens, input_positions, attn_metadata, seq_lens,
|
return (input_tokens, input_positions, attn_metadata, seq_lens,
|
||||||
multi_modal_kwargs)
|
multi_modal_kwargs)
|
||||||
|
|||||||
Reference in New Issue
Block a user