[Model] PP support for embedding models and update docs (#9090)
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
@@ -7,7 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 """
 import os
 from dataclasses import dataclass
-from typing import List, NamedTuple, Optional
+from typing import List, Literal, NamedTuple, Optional
 
 import pytest
 
@@ -97,6 +97,9 @@ class PPTestSettings:
                    self.trust_remote_code, self.tokenizer_mode)
 
 
+# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
+# The values displayed here are only a rough indicator of the size of the model
+
 # yapf: disable
 GENERATION_MODEL_SETTINGS = {
     # [DETAILED TESTS]
@@ -104,15 +107,13 @@ GENERATION_MODEL_SETTINGS = {
     # [FAST TESTS]
     # Uses Llama
     # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
-    # TODO: Test on larger GPU
-    # "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
     "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
     "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
     "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
     "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
-    # TODO: Test on larger GPU
-    # "databricks/dbrx-instruct": PPTestSettings.fast(),
+    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
     "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
@@ -161,8 +162,9 @@ GENERATION_MODEL_SETTINGS = {
 
 EMBEDDING_MODEL_SETTINGS = {  # type: ignore[var-annotated]
     # [FAST TESTS]
-    # Uses Llama
-    # "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
 }
 
 MULTIMODAL_MODEL_SETTINGS = {
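For orientation while reading these settings hunks: each entry maps a model ID to a PPTestSettings value (the real class is defined earlier in this file, with constructors such as the fast() used throughout these dicts), and its iter_params() feeds the parametrize decorators below. The following is a minimal, self-contained sketch of those shapes; the field names are assumptions inferred from how the values are unpacked later in this diff, not the actual class body.

from typing import List, NamedTuple, Optional


class ParallelSetup(NamedTuple):
    # Unpacked below as: tp_size, pp_size, eager_mode, chunked_prefill
    tp_size: int
    pp_size: int
    eager_mode: bool
    chunked_prefill: bool


class PPTestSettings(NamedTuple):
    # Sketch only: the real class also provides constructors like fast(),
    # which presumably derive these setups from tp_base/pp_base.
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
    trust_remote_code: bool = False
    tokenizer_mode: Optional[str] = None

    def iter_params(self, model_name: str):
        # Yields one tuple per (parallel setup, backend) combination, matching
        # the argnames ("model_name", "parallel_setup", "distributed_backend",
        # "trust_remote_code", "tokenizer_mode") used in the decorators below.
        for parallel_setup in self.parallel_setups:
            for backend in self.distributed_backends:
                yield (model_name, parallel_setup, backend,
                       self.trust_remote_code, self.tokenizer_mode)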
@@ -192,40 +194,35 @@ CONDITIONAL_GENERATION_MODEL_SETTINGS = {  # type: ignore[var-annotated]
 }
 # yapf: enable
 
-MODEL_SETTINGS = {
-    **GENERATION_MODEL_SETTINGS,
-    **EMBEDDING_MODEL_SETTINGS,
-    **MULTIMODAL_MODEL_SETTINGS,
-}
-
-# You can update this on your local machine to run specific tests
+# NOTE: You can update this on your local machine to run specific tests
 TEST_MODELS = [
+    # [LANGUAGE GENERATION]
     "meta-llama/Meta-Llama-3-8B",
-    "facebook/chameleon-7b",
     "ibm/PowerLM-3b",
+    # [LANGUAGE EMBEDDING]
+    "intfloat/e5-mistral-7b-instruct",
+    "BAAI/bge-multilingual-gemma2",
+    # [MULTIMODAL GENERATION]
     "OpenGVLab/InternVL2-1B",
     "microsoft/Phi-3-vision-128k-instruct",
-    "mistralai/Pixtral-12B-2409",
     "fixie-ai/ultravox-v0_3",
 ]
 
 
-@pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
-     "trust_remote_code", "tokenizer_mode"),
-    [
-        params for model_name, settings in MODEL_SETTINGS.items()
-        for params in settings.iter_params(model_name)
-        if model_name in TEST_MODELS
-    ],
-)
-@fork_new_process_for_each_test
-def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,
-                    distributed_backend: str, trust_remote_code: bool,
-                    tokenizer_mode: Optional[str], num_gpus_available):
+def _compare_tp(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available: int,
+    *,
+    method: Literal["generate", "encode"] = "encode",
+):
     tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
 
-    if num_gpus_available < tp_size:
-        pytest.skip(f"Need at least {tp_size} GPUs to run the test")
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
     if VLLM_MULTI_NODE and distributed_backend == "mp":
         pytest.skip("Skipping multi-node pipeline parallel test for "
                     "multiprocessing distributed backend")
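The keyword-only method flag is the crux of the refactor: one comparison helper now serves both generation and embedding models, with each wrapper below pinning the appropriate value. Here is a runnable toy illustration of that dispatch; FakeLLM and run_request are illustrative stand-ins, not vLLM's API.

from typing import List, Literal, Union


class FakeLLM:
    """Toy stand-in used only to illustrate the dispatch; not vLLM's API."""

    def generate(self, prompt: str) -> str:
        return f"<generated for {prompt!r}>"

    def encode(self, prompt: str) -> List[float]:
        return [float(len(prompt))]


def run_request(llm: FakeLLM, prompt: str, *,
                method: Literal["generate", "encode"] = "encode",
                ) -> Union[str, List[float]]:
    # Mirrors how `method` is threaded through _compare_tp: generation
    # models are exercised via generate(), embedding models via encode().
    if method == "generate":
        return llm.generate(prompt)
    return llm.encode(prompt)


print(run_request(FakeLLM(), "hello", method="generate"))
print(run_request(FakeLLM(), "hello", method="encode"))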
@@ -286,10 +283,95 @@ def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,
     ]
 
     try:
-        compare_two_settings(model_name, pp_args, tp_args, pp_env)
+        compare_two_settings(model_name,
+                             pp_args,
+                             tp_args,
+                             pp_env,
+                             method=method)
     except Exception:
         if pp_env is None:
             raise
         else:
             # Ray ADAG tests are flaky, so we don't want to fail the test
             logger.exception("Ray ADAG tests failed")
+
+
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_language_generation(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
+
+
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_language_embedding(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="encode")
+
+
+@pytest.mark.parametrize(
+    ("model_name", "parallel_setup", "distributed_backend",
+     "trust_remote_code", "tokenizer_mode"),
+    [
+        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
+        for params in settings.iter_params(model_name)
+        if model_name in TEST_MODELS
+    ],
+)
+@fork_new_process_for_each_test
+def test_tp_multimodal_generation(
+    model_name: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    trust_remote_code: bool,
+    tokenizer_mode: Optional[str],
+    num_gpus_available,
+):
+    _compare_tp(model_name,
+                parallel_setup,
+                distributed_backend,
+                trust_remote_code,
+                tokenizer_mode,
+                num_gpus_available,
+                method="generate")
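Splitting test_compare_tp into three wrappers keeps one parametrize block per settings dict, and the `if model_name in TEST_MODELS` guard trims each cross-product down to the locally opted-in models. A self-contained miniature of that collection pattern follows; SETTINGS and OPTED_IN are toy stand-ins for the *_MODEL_SETTINGS dicts and TEST_MODELS.

import pytest

# Toy stand-ins for *_MODEL_SETTINGS and TEST_MODELS.
SETTINGS = {"model-a": [1, 2], "model-b": [3]}
OPTED_IN = ["model-a"]


@pytest.mark.parametrize(
    ("model_name", "param"),
    [
        (name, param) for name, params in SETTINGS.items()
        for param in params
        if name in OPTED_IN
    ],
)
def test_only_opted_in_models(model_name, param):
    # Only ("model-a", 1) and ("model-a", 2) are collected; "model-b" is
    # filtered out, exactly as models absent from TEST_MODELS are above.
    assert model_name in OPTED_IN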