Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by Harry Mellor on 2025-10-05 15:06:22 +01:00; committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
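
A condensed, purely illustrative sketch of the mechanical pattern repeated throughout the diff below (single quotes become double quotes, closing parentheses move to their own dedented line, trailing commas are added, and wrapped signatures/imports are rewritten one item per line). The snippet is not part of the commit; the wrapper function names are hypothetical, added only so it stands alone:

# Before (yapf + isort style), as seen in the first hunk:
def _require_modelscope_old():
    raise ImportError('Using vLLM with ModelScope needs modelscope>=1.18.1, please '
                      'install by `pip install modelscope -U`')

# After (ruff format style):
def _require_modelscope_new():
    raise ImportError(
        "Using vLLM with ModelScope needs modelscope>=1.18.1, please "
        "install by `pip install modelscope -U`"
    )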

View File

@@ -10,10 +10,11 @@ if envs.VLLM_USE_MODELSCOPE:
from packaging import version
# patch_hub begins from modelscope>=1.18.1
if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
raise ImportError(
'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
'install by `pip install modelscope -U`')
"Using vLLM with ModelScope needs modelscope>=1.18.1, please "
"install by `pip install modelscope -U`"
)
from modelscope.utils.hf_util import patch_hub
# Patch hub to download models from modelscope to speed up.
@@ -21,4 +22,5 @@ if envs.VLLM_USE_MODELSCOPE:
except ImportError as err:
raise ImportError(
"Please install modelscope>=1.18.1 via "
"`pip install modelscope>=1.18.1` to use ModelScope.") from err
"`pip install modelscope>=1.18.1` to use ModelScope."
) from err

View File

@@ -12,16 +12,14 @@ CHAT_TEMPLATES_DIR = Path(__file__).parent
ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
def _get_qwen_chat_template_fallback(
tokenizer_name_or_path: str) -> Optional[Path]:
def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
if tokenizer_name_or_path.endswith("-Chat"):
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
return CHAT_TEMPLATES_DIR / "template_basic.jinja"
def _get_minicpmv_chat_template_fallback(
tokenizer_name_or_path: str) -> Optional[Path]:
def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
# MiniCPM-V-4.5 version uses a dedicated template
if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path:
return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"
@@ -51,8 +49,10 @@ def register_chat_template_fallback_path(
if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
logger.warning(
"Model type %s already has a chat template registered. "
"It will be overwritten by the new chat template %s.", model_type,
chat_template)
"It will be overwritten by the new chat template %s.",
model_type,
chat_template,
)
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template

View File

@@ -10,26 +10,32 @@ from pathlib import Path
from typing import Any, Callable, Literal, Optional, TypeVar, Union
import huggingface_hub
from huggingface_hub import get_safetensors_metadata, hf_hub_download
from huggingface_hub import (
get_safetensors_metadata,
hf_hub_download,
try_to_load_from_cache,
)
from huggingface_hub import list_repo_files as hf_list_repo_files
from huggingface_hub import try_to_load_from_cache
from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError)
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from transformers import GenerationConfig, PretrainedConfig
from transformers.models.auto.image_processing_auto import (
get_image_processor_config)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.models.auto.tokenization_auto import get_tokenizer_config
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.config_parser_base import ConfigParserBase
from vllm.transformers_utils.utils import (check_gguf_file,
parse_safetensors_file_metadata)
from vllm.transformers_utils.utils import (
check_gguf_file,
parse_safetensors_file_metadata,
)
if envs.VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig
@@ -45,21 +51,21 @@ def _get_hf_token() -> Optional[str]:
"""
Get the HuggingFace token from environment variable.
Returns None if the token is not set, is an empty string,
Returns None if the token is not set, is an empty string,
or contains only whitespace.
This follows the same pattern as huggingface_hub library which
treats empty string tokens as None to avoid authentication errors.
"""
token = os.getenv('HF_TOKEN')
token = os.getenv("HF_TOKEN")
if token and token.strip():
return token
return None
class LazyConfigDict(dict):
def __getitem__(self, key):
import vllm.transformers_utils.configs as configs
return getattr(configs, super().__getitem__(key))
@@ -84,30 +90,28 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
ultravox="UltravoxConfig",
step3_vl="Step3VLConfig",
step3_text="Step3TextConfig",
qwen3_next="Qwen3NextConfig")
qwen3_next="Qwen3NextConfig",
)
_CONFIG_ATTRS_MAPPING: dict[str, str] = {
"llm_config": "text_config",
}
_AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
"internvl_chat": {
"has_no_defaults_at_init": True
},
"NVLM_D": {
"has_no_defaults_at_init": True
},
"internvl_chat": {"has_no_defaults_at_init": True},
"NVLM_D": {"has_no_defaults_at_init": True},
}
class HFConfigParser(ConfigParserBase):
def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
config_dict, _ = PretrainedConfig.get_config_dict(
model,
@@ -119,8 +123,11 @@ class HFConfigParser(ConfigParserBase):
# Use custom model class if it's in our registry
model_type = config_dict.get("model_type")
if model_type is None:
model_type = "speculators" if config_dict.get(
"speculators_config") is not None else model_type
model_type = (
"speculators"
if config_dict.get("speculators_config") is not None
else model_type
)
if model_type in _CONFIG_REGISTRY:
config_class = _CONFIG_REGISTRY[model_type]
@@ -133,8 +140,7 @@ class HFConfigParser(ConfigParserBase):
)
else:
try:
kwargs = _maybe_update_auto_config_kwargs(
kwargs, model_type=model_type)
kwargs = _maybe_update_auto_config_kwargs(kwargs, model_type=model_type)
config = AutoConfig.from_pretrained(
model,
trust_remote_code=trust_remote_code,
@@ -144,15 +150,17 @@ class HFConfigParser(ConfigParserBase):
**kwargs,
)
except ValueError as e:
if (not trust_remote_code
and "requires you to execute the configuration file"
in str(e)):
if (
not trust_remote_code
and "requires you to execute the configuration file" in str(e)
):
err_msg = (
"Failed to load the model config. If the model "
"is a custom model not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -161,20 +169,23 @@ class HFConfigParser(ConfigParserBase):
class MistralConfigParser(ConfigParserBase):
def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
# This function loads a params.json config which
# should be used when loading models in mistral format
config_dict = _download_mistral_config_file(model, revision)
if (max_position_embeddings :=
config_dict.get("max_position_embeddings")) is None:
if (
max_position_embeddings := config_dict.get("max_position_embeddings")
) is None:
max_position_embeddings = _maybe_retrieve_max_pos_from_hf(
model, revision, **kwargs)
model, revision, **kwargs
)
config_dict["max_position_embeddings"] = max_position_embeddings
from vllm.transformers_utils.configs.mistral import adapt_config_dict
@@ -183,8 +194,9 @@ class MistralConfigParser(ConfigParserBase):
# Mistral configs may define sliding_window as list[int]. Convert it
# to int and add the layer_types list[str] to make it HF compatible
if ((sliding_window := getattr(config, "sliding_window", None))
and isinstance(sliding_window, list)):
if (sliding_window := getattr(config, "sliding_window", None)) and isinstance(
sliding_window, list
):
pattern_repeats = config.num_hidden_layers // len(sliding_window)
layer_types = sliding_window * pattern_repeats
config.layer_types = [
@@ -216,44 +228,51 @@ def get_config_parser(config_format: str) -> ConfigParserBase:
def register_config_parser(config_format: str):
"""Register a customized vllm config parser.
When a config format is not supported by vllm, you can register a customized
config parser to support it.
Args:
config_format (str): The config parser format name.
Examples:
When a config format is not supported by vllm, you can register a customized
config parser to support it.
Args:
config_format (str): The config parser format name.
Examples:
>>> from vllm.transformers_utils.config import (get_config_parser,
register_config_parser)
>>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
>>>
>>> @register_config_parser("custom_config_parser")
... class CustomConfigParser(ConfigParserBase):
... def parse(self,
... model: Union[str, Path],
... trust_remote_code: bool,
... revision: Optional[str] = None,
... code_revision: Optional[str] = None,
... **kwargs) -> tuple[dict, PretrainedConfig]:
... raise NotImplementedError
>>>
>>> type(get_config_parser("custom_config_parser"))
<class 'CustomConfigParser'>
>>> from vllm.transformers_utils.config import (get_config_parser,
register_config_parser)
>>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
>>>
>>> @register_config_parser("custom_config_parser")
... class CustomConfigParser(ConfigParserBase):
... def parse(
... self,
... model: Union[str, Path],
... trust_remote_code: bool,
... revision: Optional[str] = None,
... code_revision: Optional[str] = None,
... **kwargs,
... ) -> tuple[dict, PretrainedConfig]:
... raise NotImplementedError
>>>
>>> type(get_config_parser("custom_config_parser"))
<class 'CustomConfigParser'>
""" # noqa: E501
def _wrapper(config_parser_cls):
if config_format in _CONFIG_FORMAT_TO_CONFIG_PARSER:
logger.warning(
"Config format `%s` is already registered, and will be "
"overwritten by the new parser class `%s`.", config_format,
config_parser_cls)
"overwritten by the new parser class `%s`.",
config_format,
config_parser_cls,
)
if not issubclass(config_parser_cls, ConfigParserBase):
raise ValueError("The config parser must be a subclass of "
"`ConfigParserBase`.")
raise ValueError(
"The config parser must be a subclass of `ConfigParserBase`."
)
_CONFIG_FORMAT_TO_CONFIG_PARSER[config_format] = config_parser_cls
logger.info("Registered config parser `%s` with config format `%s`",
config_parser_cls, config_format)
logger.info(
"Registered config parser `%s` with config format `%s`",
config_parser_cls,
config_format,
)
return config_parser_cls
return _wrapper
@@ -275,8 +294,9 @@ def with_retry(
if attempt == max_retries - 1:
logger.error("%s: %s", log_msg, e)
raise
logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1,
max_retries)
logger.error(
"%s: %s, retrying %d of %d", log_msg, e, attempt + 1, max_retries
)
time.sleep(retry_delay)
retry_delay *= 2
@@ -292,28 +312,27 @@ def list_repo_files(
repo_type: Optional[str] = None,
token: Union[str, bool, None] = None,
) -> list[str]:
def lookup_files() -> list[str]:
# directly list files if model is local
if (local_path := Path(repo_id)).exists():
return [
str(file.relative_to(local_path))
for file in local_path.rglob('*') if file.is_file()
for file in local_path.rglob("*")
if file.is_file()
]
# if model is remote, use hf_hub api to list files
try:
if envs.VLLM_USE_MODELSCOPE:
from vllm.transformers_utils.utils import (
modelscope_list_repo_files)
return modelscope_list_repo_files(repo_id,
revision=revision,
token=os.getenv(
"MODELSCOPE_API_TOKEN",
None))
return hf_list_repo_files(repo_id,
revision=revision,
repo_type=repo_type,
token=token)
from vllm.transformers_utils.utils import modelscope_list_repo_files
return modelscope_list_repo_files(
repo_id,
revision=revision,
token=os.getenv("MODELSCOPE_API_TOKEN", None),
)
return hf_list_repo_files(
repo_id, revision=revision, repo_type=repo_type, token=token
)
except huggingface_hub.errors.OfflineModeIsEnabled:
# Don't raise in offline mode,
# all we know is that we don't have this
@@ -331,23 +350,23 @@ def file_exists(
revision: Optional[str] = None,
token: Union[str, bool, None] = None,
) -> bool:
file_list = list_repo_files(repo_id,
repo_type=repo_type,
revision=revision,
token=token)
file_list = list_repo_files(
repo_id, repo_type=repo_type, revision=revision, token=token
)
return file_name in file_list
# In offline mode the result can be a false negative
def file_or_path_exists(model: Union[str, Path], config_name: str,
revision: Optional[str]) -> bool:
def file_or_path_exists(
model: Union[str, Path], config_name: str, revision: Optional[str]
) -> bool:
if (local_path := Path(model)).exists():
return (local_path / config_name).is_file()
# Offline mode support: Check if config file is cached already
cached_filepath = try_to_load_from_cache(repo_id=model,
filename=config_name,
revision=revision)
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=config_name, revision=revision
)
if isinstance(cached_filepath, str):
# The config file exists in cache- we can continue trying to load
return True
@@ -356,10 +375,9 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
# hf_hub. This will fail in offline mode.
# Call HF to check if the file exists
return file_exists(str(model),
config_name,
revision=revision,
token=_get_hf_token())
return file_exists(
str(model), config_name, revision=revision, token=_get_hf_token()
)
def patch_rope_scaling(config: PretrainedConfig) -> None:
@@ -381,7 +399,8 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
f"field) and 'type={rope_type_legacy}' (legacy field). "
"You should only specify one of them.")
"You should only specify one of them."
)
if "rope_type" not in rope_scaling and "type" in rope_scaling:
rope_scaling["rope_type"] = rope_scaling["type"]
@@ -409,8 +428,11 @@ def _uses_mrope(config: PretrainedConfig) -> bool:
def uses_mrope(config: PretrainedConfig) -> bool:
"""Detect if the model with this config uses M-ROPE."""
return _uses_mrope(config) or _uses_mrope(
config.get_text_config()) or thinker_uses_mrope(config)
return (
_uses_mrope(config)
or _uses_mrope(config.get_text_config())
or thinker_uses_mrope(config)
)
def thinker_uses_mrope(config: PretrainedConfig) -> bool:
@@ -432,8 +454,7 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool:
def _is_encoder_decoder(config: PretrainedConfig) -> bool:
return getattr(config, "is_encoder_decoder", False)
return (_is_encoder_decoder(config)
or _is_encoder_decoder(config.get_text_config()))
return _is_encoder_decoder(config) or _is_encoder_decoder(config.get_text_config())
def is_interleaved(config: PretrainedConfig) -> bool:
@@ -462,8 +483,7 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
if hasattr(config, old_attr):
if not hasattr(config, new_attr):
config.update({new_attr: getattr(config, old_attr)})
logger.debug("Remapped config attribute '%s' to '%s'", old_attr,
new_attr)
logger.debug("Remapped config attribute '%s' to '%s'", old_attr, new_attr)
return config
@@ -512,11 +532,11 @@ def maybe_override_with_speculators(
return model, tokenizer, vllm_speculative_config
# Speculators format detected - process overrides
from vllm.transformers_utils.configs.speculators.base import (
SpeculatorsConfig)
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
speculative_config = SpeculatorsConfig.extract_vllm_speculative_config(
config_dict=config_dict)
config_dict=config_dict
)
# Set the draft model to the speculators model
speculative_config["model"] = model
@@ -535,8 +555,7 @@ def get_config(
code_revision: Optional[str] = None,
config_format: Union[str, ConfigFormat] = "auto",
hf_overrides_kw: Optional[dict[str, Any]] = None,
hf_overrides_fn: Optional[Callable[[PretrainedConfig],
PretrainedConfig]] = None,
hf_overrides_fn: Optional[Callable[[PretrainedConfig], PretrainedConfig]] = None,
**kwargs,
) -> PretrainedConfig:
# Separate model folder from file path for GGUF models
@@ -548,12 +567,9 @@ def get_config(
if config_format == "auto":
try:
if is_gguf or file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision):
if is_gguf or file_or_path_exists(model, HF_CONFIG_NAME, revision=revision):
config_format = "hf"
elif file_or_path_exists(model,
MISTRAL_CONFIG_NAME,
revision=revision):
elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
config_format = "mistral"
else:
raise ValueError(
@@ -561,7 +577,8 @@ def get_config(
"With config_format 'auto', ensure your model has either "
"config.json (HF format) or params.json (Mistral format). "
"Otherwise please specify your_custom_config_format "
"in engine args for customized config parser.")
"in engine args for customized config parser."
)
except Exception as e:
error_message = (
@@ -576,7 +593,8 @@ def get_config(
"'params.json'.\n"
"3. For GGUF: pass the local path of the GGUF checkpoint.\n"
" Loading GGUF from a remote repo directly is not yet "
"supported.\n").format(model=model)
"supported.\n"
).format(model=model)
raise ValueError(error_message) from e
@@ -591,8 +609,7 @@ def get_config(
# Special architecture mapping check for GGUF models
if is_gguf:
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
raise RuntimeError(
f"Can't get gguf config for {config.model_type}.")
raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
config.update({"architectures": [model_type]})
@@ -602,29 +619,35 @@ def get_config(
# ModelOpt 0.29.0 and before saves the quantization config in a separate
# "hf_quant_config.json" in the same directory as the model config file.
if quantization_config is None \
and file_or_path_exists(model, "hf_quant_config.json", revision):
quantization_config = get_hf_file_to_dict("hf_quant_config.json",
model, revision)
if quantization_config is None and file_or_path_exists(
model, "hf_quant_config.json", revision
):
quantization_config = get_hf_file_to_dict(
"hf_quant_config.json", model, revision
)
if quantization_config is not None:
config.quantization_config = quantization_config
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
scale_fmt = quantization_config.get("scale_fmt", None)
if scale_fmt in ("ue8m0", ):
if scale_fmt in ("ue8m0",):
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
logger.info_once(
("Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0."),
(
"Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0."
),
scale_fmt,
)
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
logger.warning_once(
("Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
"Hopper UE8M0 disabled."),
(
"Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
"Hopper UE8M0 disabled."
),
scale_fmt,
)
@@ -643,17 +666,17 @@ def get_config(
return config
def try_get_local_file(model: Union[str, Path],
file_name: str,
revision: Optional[str] = 'main') -> Optional[Path]:
def try_get_local_file(
model: Union[str, Path], file_name: str, revision: Optional[str] = "main"
) -> Optional[Path]:
file_path = Path(model) / file_name
if file_path.is_file():
return file_path
else:
try:
cached_filepath = try_to_load_from_cache(repo_id=model,
filename=file_name,
revision=revision)
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=file_name, revision=revision
)
if isinstance(cached_filepath, str):
return Path(cached_filepath)
except ValueError:
@@ -661,9 +684,9 @@ def try_get_local_file(model: Union[str, Path],
return None
def get_hf_file_to_dict(file_name: str,
model: Union[str, Path],
revision: Optional[str] = 'main'):
def get_hf_file_to_dict(
file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
):
"""
Downloads a file from the Hugging Face Hub and returns
its contents as a dictionary.
@@ -678,25 +701,27 @@ def get_hf_file_to_dict(file_name: str,
the contents of the downloaded file.
"""
file_path = try_get_local_file(model=model,
file_name=file_name,
revision=revision)
file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
if file_path is None:
try:
hf_hub_file = hf_hub_download(model, file_name, revision=revision)
except huggingface_hub.errors.OfflineModeIsEnabled:
return None
except (RepositoryNotFoundError, RevisionNotFoundError,
EntryNotFoundError, LocalEntryNotFoundError) as e:
except (
RepositoryNotFoundError,
RevisionNotFoundError,
EntryNotFoundError,
LocalEntryNotFoundError,
) as e:
logger.debug("File or repository not found in hf_hub_download", e)
return None
except HfHubHTTPError as e:
logger.warning(
"Cannot connect to Hugging Face Hub. Skipping file "
"download for '%s':",
"Cannot connect to Hugging Face Hub. Skipping file download for '%s':",
file_name,
exc_info=e)
exc_info=e,
)
return None
file_path = Path(hf_hub_file)
@@ -708,8 +733,7 @@ def get_hf_file_to_dict(file_name: str,
@cache
def get_pooling_config(model: str,
revision: Optional[str] = 'main') -> Optional[dict]:
def get_pooling_config(model: str, revision: Optional[str] = "main") -> Optional[dict]:
"""
This function gets the pooling and normalize
config from the model - only applies to
@@ -717,20 +741,20 @@ def get_pooling_config(model: str,
Args:
model: The name of the Hugging Face model.
revision: The specific version of the model to use.
revision: The specific version of the model to use.
Defaults to 'main'.
Returns:
A dictionary containing the pooling type and whether
A dictionary containing the pooling type and whether
normalization is used, or None if no pooling configuration is found.
"""
modules_file_name = "modules.json"
modules_dict = None
if file_or_path_exists(model=model,
config_name=modules_file_name,
revision=revision):
if file_or_path_exists(
model=model, config_name=modules_file_name, revision=revision
):
modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)
if modules_dict is None:
@@ -738,20 +762,31 @@ def get_pooling_config(model: str,
logger.info("Found sentence-transformers modules configuration.")
pooling = next((item for item in modules_dict
if item["type"] == "sentence_transformers.models.Pooling"),
None)
pooling = next(
(
item
for item in modules_dict
if item["type"] == "sentence_transformers.models.Pooling"
),
None,
)
normalize = bool(
next((item for item in modules_dict
if item["type"] == "sentence_transformers.models.Normalize"),
False))
next(
(
item
for item in modules_dict
if item["type"] == "sentence_transformers.models.Normalize"
),
False,
)
)
if pooling:
pooling_file_name = "{}/config.json".format(pooling["path"])
pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
pooling_type_name = next(
(item for item, val in pooling_dict.items() if val is True), None)
(item for item, val in pooling_dict.items() if val is True), None
)
if pooling_type_name is not None:
pooling_type_name = get_pooling_config_name(pooling_type_name)
@@ -772,20 +807,19 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
if "lasttoken" in pooling_name:
pooling_name = "last"
supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
supported_pooling_types = ["LAST", "ALL", "CLS", "STEP", "MEAN"]
pooling_type_name = pooling_name.upper()
if pooling_type_name in supported_pooling_types:
return pooling_type_name
raise NotImplementedError(
f"Pooling type {pooling_type_name} not supported")
raise NotImplementedError(f"Pooling type {pooling_type_name} not supported")
@cache
def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
revision: Optional[str] = 'main'
):
def get_sentence_transformer_tokenizer_config(
model: Union[str, Path], revision: Optional[str] = "main"
):
"""
Returns the tokenization configuration dictionary for a
given Sentence Transformer BERT model.
@@ -812,9 +846,10 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
encoder_dict = None
for config_file in sentence_transformer_config_files:
if try_get_local_file(model=model,
file_name=config_file,
revision=revision) is not None:
if (
try_get_local_file(model=model, file_name=config_file, revision=revision)
is not None
):
encoder_dict = get_hf_file_to_dict(config_file, model, revision)
if encoder_dict:
break
@@ -822,16 +857,15 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
if not encoder_dict and not Path(model).is_absolute():
try:
# If model is on HuggingfaceHub, get the repo files
repo_files = list_repo_files(model,
revision=revision,
token=_get_hf_token())
repo_files = list_repo_files(
model, revision=revision, token=_get_hf_token()
)
except Exception:
repo_files = []
for config_name in sentence_transformer_config_files:
if config_name in repo_files:
encoder_dict = get_hf_file_to_dict(config_name, model,
revision)
encoder_dict = get_hf_file_to_dict(config_name, model, revision)
if encoder_dict:
break
@@ -848,34 +882,39 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
def maybe_register_config_serialize_by_value() -> None:
"""Try to register HF model configuration class to serialize by value
If trust_remote_code is set, and the model's config file specifies an
`AutoConfig` class, then the config class is typically an instance of
a custom class imported from the HF modules cache.
If trust_remote_code is set, and the model's config file specifies an
`AutoConfig` class, then the config class is typically an instance of
a custom class imported from the HF modules cache.
Examples:
Examples:
>>> from transformers import AutoConfig
>>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
>>> import transformers_modules # error, not initialized
>>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
>>> import transformers_modules # success, initialized
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
>>> from transformers import AutoConfig
>>> klass = AutoConfig.from_pretrained(
... "meta-llama/Meta-Llama-3-8B", trust_remote_code=True
... )
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
>>> import transformers_modules # error, not initialized
>>> klass = AutoConfig.from_pretrained(
... "deepseek-ai/DeepSeek-V2.5", trust_remote_code=True
... )
>>> import transformers_modules # success, initialized
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
In the DeepSeek example, the config class is an instance of a custom
class that is not serializable by default. This class will not be
importable in spawned workers, and won't exist at all on
other nodes, which breaks serialization of the config.
In the DeepSeek example, the config class is an instance of a custom
class that is not serializable by default. This class will not be
importable in spawned workers, and won't exist at all on
other nodes, which breaks serialization of the config.
In this function we tell the cloudpickle serialization library to pass
instances of these generated classes by value instead of by reference,
i.e. the class definition is serialized along with its data so that the
class module does not need to be importable on the receiving end.
In this function we tell the cloudpickle serialization library to pass
instances of these generated classes by value instead of by reference,
i.e. the class definition is serialized along with its data so that the
class module does not need to be importable on the receiving end.
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
""" # noqa
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
""" # noqa
try:
import transformers_modules
transformers_modules_available = True
except ImportError:
transformers_modules_available = False
@@ -892,7 +931,7 @@ def maybe_register_config_serialize_by_value() -> None:
# serialization of VllmConfig objects that may contain custom configs
# from transformers_modules
def _reduce_config(config: VllmConfig):
return (pickle.loads, (cloudpickle.dumps(config), ))
return (pickle.loads, (cloudpickle.dumps(config),))
multiprocessing.reducer.register(VllmConfig, _reduce_config)
@@ -902,6 +941,7 @@ def maybe_register_config_serialize_by_value() -> None:
# ray vendors its own version of cloudpickle
from vllm.executor.ray_utils import ray
if ray:
ray.cloudpickle.register_pickle_by_value(transformers_modules)
@@ -911,7 +951,8 @@ def maybe_register_config_serialize_by_value() -> None:
" trust_remote_code with by-value serialization. This may"
" lead to a later error. If remote code is not needed"
" remove `--trust-remote-code`",
exc_info=e)
exc_info=e,
)
def get_hf_image_processor_config(
@@ -926,10 +967,9 @@ def get_hf_image_processor_config(
# Separate model folder from file path for GGUF models
if check_gguf_file(model):
model = Path(model).parent
return get_image_processor_config(model,
token=hf_token,
revision=revision,
**kwargs)
return get_image_processor_config(
model, token=hf_token, revision=revision, **kwargs
)
def get_hf_text_config(config: PretrainedConfig):
@@ -984,8 +1024,9 @@ def try_get_safetensors_metadata(
)
try:
return with_retry(get_safetensors_metadata_partial,
"Error retrieving safetensors")
return with_retry(
get_safetensors_metadata_partial, "Error retrieving safetensors"
)
except Exception:
return None
@@ -1018,9 +1059,9 @@ def get_safetensors_params_metadata(
safetensors_to_check = model_path.glob("*.safetensors")
full_metadata = {
param_name: info
for file_path in safetensors_to_check if file_path.is_file()
for param_name, info in parse_safetensors_file_metadata(
file_path).items()
for file_path in safetensors_to_check
if file_path.is_file()
for param_name, info in parse_safetensors_file_metadata(file_path).items()
}
else:
repo_mt = try_get_safetensors_metadata(model, revision=revision)
@@ -1040,7 +1081,8 @@ def _download_mistral_config_file(model, revision) -> dict:
raise ValueError(
f"Failed to load mistral '{config_file_name}' config for model "
f"{model}. Please check if the model is a mistral-format model "
f"and if the config file exists.")
f"and if the config file exists."
)
assert isinstance(config_dict, dict)
return config_dict
@@ -1049,10 +1091,12 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
max_position_embeddings = 128_000
try:
trust_remote_code_val = kwargs.get("trust_remote_code", False)
hf_config = get_config(model=model,
trust_remote_code=trust_remote_code_val,
revision=revision,
config_format="hf")
hf_config = get_config(
model=model,
trust_remote_code=trust_remote_code_val,
revision=revision,
config_format="hf",
)
if hf_value := hf_config.get_text_config().max_position_embeddings:
max_position_embeddings = hf_value
except Exception as e:
@@ -1060,7 +1104,8 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
"The params.json file is missing 'max_position_embeddings'"
" and could not get a value from the HF config."
" Defaulting to 128000",
exc_info=e)
exc_info=e,
)
return max_position_embeddings
@@ -1076,29 +1121,28 @@ def get_model_path(model: Union[str, Path], revision: Optional[str] = None):
if envs.VLLM_USE_MODELSCOPE:
from modelscope.hub.snapshot_download import snapshot_download
return snapshot_download(model_id=model, **common_kwargs)
from huggingface_hub import snapshot_download
return snapshot_download(repo_id=model, **common_kwargs)
def get_hf_file_bytes(file_name: str,
model: Union[str, Path],
revision: Optional[str] = 'main') -> Optional[bytes]:
def get_hf_file_bytes(
file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
) -> Optional[bytes]:
"""Get file contents from HuggingFace repository as bytes."""
file_path = try_get_local_file(model=model,
file_name=file_name,
revision=revision)
file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
if file_path is None:
hf_hub_file = hf_hub_download(model,
file_name,
revision=revision,
token=_get_hf_token())
hf_hub_file = hf_hub_download(
model, file_name, revision=revision, token=_get_hf_token()
)
file_path = Path(hf_hub_file)
if file_path is not None and file_path.is_file():
with open(file_path, 'rb') as file:
with open(file_path, "rb") as file:
return file.read()
return None

View File

@@ -9,12 +9,13 @@ from transformers import PretrainedConfig
class ConfigParserBase(ABC):
@abstractmethod
def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
raise NotImplementedError

View File

@@ -12,6 +12,7 @@ from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
@@ -30,9 +31,11 @@ from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig)
from vllm.transformers_utils.configs.step3_vl import (
Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig,
)
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__ = [

View File

@@ -13,33 +13,35 @@ class ChatGLMConfig(PretrainedConfig):
"n_head_kv": "multi_query_group_num",
}
def __init__(self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs):
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs,
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
@@ -55,7 +57,8 @@ class ChatGLMConfig(PretrainedConfig):
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = (
apply_residual_connection_post_layernorm)
apply_residual_connection_post_layernorm
)
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias

View File

@@ -7,7 +7,6 @@ logger = logging.get_logger(__name__)
class DeepseekV3Config(PretrainedConfig):
model_type = "deepseek_v3"
keys_to_ignore_at_inference = ["past_key_values"]
@@ -30,14 +29,14 @@ class DeepseekV3Config(PretrainedConfig):
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='noaux_tc',
topk_method="noaux_tc",
n_group=8,
topk_group=4,
num_experts_per_tok=8,
moe_layer_freq=1,
first_k_dense_replace=3,
norm_topk_prob=True,
scoring_func='sigmoid',
scoring_func="sigmoid",
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,

View File

@@ -25,20 +25,22 @@ class VisionEncoderConfig(PretrainedConfig):
deterministic: bool = False
num_recomputing_layers: int = 0
def __init__(self,
model_name: str = "vit_so400m_patch14_siglip_384.webli",
image_size: int = 384,
patch_size: int = 16,
width: int = 1024,
layers: int = 24,
heads: int = 16,
mlp_ratio: int = 4,
global_pool: str = "map",
ignore_head: bool = True,
class_token: bool = False,
num_classes: int = 0,
use_checkpoint: bool = False,
**kwargs):
def __init__(
self,
model_name: str = "vit_so400m_patch14_siglip_384.webli",
image_size: int = 384,
patch_size: int = 16,
width: int = 1024,
layers: int = 24,
heads: int = 16,
mlp_ratio: int = 4,
global_pool: str = "map",
ignore_head: bool = True,
class_token: bool = False,
num_classes: int = 0,
use_checkpoint: bool = False,
**kwargs,
):
self.model_name = model_name
self.image_size = image_size
self.patch_size = patch_size
@@ -65,14 +67,16 @@ class MlpProjectorConfig(PretrainedConfig):
downsample_ratio: int = 2
token_pooling: bool = False
def __init__(self,
projector_type: str = "downsample_mlp_gelu",
input_dim: int = 1152,
n_embed: int = 2048,
depth: int = 2,
mlp_ratio: int = 1,
downsample_ratio: int = 2,
**kwargs):
def __init__(
self,
projector_type: str = "downsample_mlp_gelu",
input_dim: int = 1152,
n_embed: int = 2048,
depth: int = 2,
mlp_ratio: int = 1,
downsample_ratio: int = 2,
**kwargs,
):
self.projector_type = projector_type
self.input_dim = input_dim
self.n_embed = n_embed
@@ -84,7 +88,6 @@ class MlpProjectorConfig(PretrainedConfig):
class DeepseekV2Config(PretrainedConfig):
model_type = "deepseek_v2"
keys_to_ignore_at_inference = ["past_key_values"]
@@ -106,14 +109,14 @@ class DeepseekV2Config(PretrainedConfig):
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='gready',
topk_method="gready",
n_group=None,
topk_group=None,
num_experts_per_tok=None,
moe_layer_freq=1,
first_k_dense_replace=0,
norm_topk_prob=False,
scoring_func='softmax',
scoring_func="softmax",
aux_loss_alpha=0.001,
seq_aux=True,
hidden_act="silu",
@@ -191,14 +194,15 @@ class DeepseekVLV2Config(PretrainedConfig):
tile_tag: str = "2D"
global_view_pos: str = "head"
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)
def __init__(self,
tile_tag: str = "tile_tag",
global_view_pos: str = "head",
candidate_resolutions: tuple[tuple[int,
int]] = ((384, 384), ),
**kwargs):
def __init__(
self,
tile_tag: str = "tile_tag",
global_view_pos: str = "head",
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
**kwargs,
):
super().__init__(**kwargs)
vision_config = kwargs.get("vision_config", {})

View File

@@ -53,12 +53,14 @@ class DotsVisionConfig(PretrainedConfig):
class DotsOCRConfig(Qwen2Config):
model_type = "dots_ocr"
def __init__(self,
image_token_id=151665,
video_token_id=151656,
vision_config: Optional[dict] = None,
*args,
**kwargs):
def __init__(
self,
image_token_id=151665,
video_token_id=151656,
vision_config: Optional[dict] = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.image_token_id = image_token_id
self.video_token_id = video_token_id

View File

@@ -12,12 +12,13 @@ from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
class EAGLEConfig(PretrainedConfig):
model_type = "eagle"
def __init__(self,
model: Union[PretrainedConfig, dict, None] = None,
truncated_vocab_size: Optional[int] = None,
method: Optional[str] = 'eagle',
**kwargs):
def __init__(
self,
model: Union[PretrainedConfig, dict, None] = None,
truncated_vocab_size: Optional[int] = None,
method: Optional[str] = "eagle",
**kwargs,
):
model_config: Union[PretrainedConfig, DeepseekV2Config, None]
if isinstance(model, dict):
archs = model.get("architectures", [])
@@ -31,8 +32,7 @@ class EAGLEConfig(PretrainedConfig):
model_config = model
for k, v in kwargs.items():
if k != "architectures" and k != "model_type" and hasattr(
model_config, k):
if k != "architectures" and k != "model_type" and hasattr(model_config, k):
setattr(model_config, k, v)
self.model = model_config
@@ -40,31 +40,39 @@ class EAGLEConfig(PretrainedConfig):
if self.model is None:
self.truncated_vocab_size = None
else:
self.truncated_vocab_size = self.model.vocab_size if \
truncated_vocab_size is None else truncated_vocab_size
self.truncated_vocab_size = (
self.model.vocab_size
if truncated_vocab_size is None
else truncated_vocab_size
)
# Eagle model name should follow naming convention of
# LlamaForCausalLM -> EagleLlamaForCausalLM
# LlamaForCausalLM -> Eagle3LlamaForCausalLM
# LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
if method == "eagle":
assert self.model is not None, \
assert self.model is not None, (
"model should not be None when method is eagle"
)
kwargs["architectures"] = [
f"Eagle{arch}" if not arch.startswith("Eagle") \
else arch for arch in self.model.architectures
f"Eagle{arch}" if not arch.startswith("Eagle") else arch
for arch in self.model.architectures
]
elif method == "eagle3":
assert self.model is not None, \
assert self.model is not None, (
"model should not be None when method is eagle3"
)
kwargs["architectures"] = [
arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
else f"Eagle3{arch}" for arch in self.model.architectures
arch
if arch.startswith("Eagle3") or arch.endswith("Eagle3")
else f"Eagle3{arch}"
for arch in self.model.architectures
]
else:
raise ValueError(f"Invalid method {method}. "
"Supported methods are eagle and eagle3.")
raise ValueError(
f"Invalid method {method}. Supported methods are eagle and eagle3."
)
super().__init__(**kwargs)
@@ -80,5 +88,6 @@ class EAGLEConfig(PretrainedConfig):
**kwargs,
) -> "EAGLEConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs)
pretrained_model_name_or_path, **kwargs
)
return cls.from_dict(config_dict, **kwargs)

View File

@@ -19,6 +19,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig
@@ -77,9 +78,7 @@ class RWConfig(PretrainedConfig):
# Hack for falcon-40b
self.new_decoder_architecture = True
super().__init__(bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@property
def head_dim(self):

View File

@@ -75,7 +75,7 @@ class JAISConfig(PretrainedConfig):
Whether or not the model should return the last key/values
attentions (not used by all models).
scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
Whether to additionally scale attention weights
Whether to additionally scale attention weights
by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention
@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
if self.alibi_scaling is None:
return
if (not isinstance(self.alibi_scaling, dict)
or len(self.alibi_scaling) != 2):
if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
raise ValueError(
"`alibi_scaling` must be a dictionary with two fields, "
"`type` and `factor` or `type` and `train_seq_len`, "
f"got {self.alibi_scaling}")
f"got {self.alibi_scaling}"
)
alibi_scaling_type = self.alibi_scaling.get("type", None)
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
if alibi_scaling_type is None or alibi_scaling_type != "linear":
raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None
and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}"
)
if (
alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0, "
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None
and alibi_dynamic_scaling <= 1)):
f"got {alibi_scaling_factor}"
)
if (
alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an "
f"integer > 1, got {alibi_dynamic_scaling}")
f"integer > 1, got {alibi_dynamic_scaling}"
)

View File

@@ -12,13 +12,15 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig
class KimiVLConfig(PretrainedConfig):
model_type = "kimi_vl"
def __init__(self,
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
**kwargs):
def __init__(
self,
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
**kwargs,
):
if vision_config is None:
vision_config = MoonViTConfig()
elif isinstance(vision_config, dict):

View File

@@ -10,16 +10,17 @@ from transformers import PretrainedConfig
class MedusaConfig(PretrainedConfig):
model_type = "medusa"
def __init__(self,
hidden_size: int = 4096,
vocab_size: int = 32001,
num_heads: int = 5,
num_hidden_layers: int = 1,
max_paths: int = 64,
topk: int = 10,
truncated_vocab_size: Optional[int] = None,
**kwargs):
def __init__(
self,
hidden_size: int = 4096,
vocab_size: int = 32001,
num_heads: int = 5,
num_hidden_layers: int = 1,
max_paths: int = 64,
topk: int = 10,
truncated_vocab_size: Optional[int] = None,
**kwargs,
):
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.num_heads = num_heads
@@ -27,8 +28,9 @@ class MedusaConfig(PretrainedConfig):
self.max_paths = max_paths
self.topk = topk
self.max_seq_len = int(2**20)
self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
else truncated_vocab_size
self.truncated_vocab_size = (
vocab_size if truncated_vocab_size is None else truncated_vocab_size
)
if "architectures" not in kwargs:
kwargs["architectures"] = ["MedusaModel"]
@@ -41,12 +43,13 @@ class MedusaConfig(PretrainedConfig):
**kwargs,
) -> "MedusaConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs)
pretrained_model_name_or_path, **kwargs
)
for k in list(config_dict.keys()):
if 'num' in k:
if 'heads' in k:
if "num" in k:
if "heads" in k:
config_dict["num_heads"] = config_dict.pop(k)
elif 'layers' in k:
elif "layers" in k:
config_dict["num_hidden_layers"] = config_dict.pop(k)
return cls.from_dict(config_dict, **kwargs)

View File

@@ -25,7 +25,8 @@ from typing import Optional, Union
from transformers import PretrainedConfig
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
Qwen2_5OmniTextConfig)
Qwen2_5OmniTextConfig,
)
class DashengConfig(PretrainedConfig):
@@ -91,11 +92,13 @@ class MiDashengLMConfig(PretrainedConfig):
audio_token_id: Optional[int] = None,
**kwargs,
):
self.audio_encoder_config = DashengConfig(
**(audio_encoder_config or {}))
self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
self.subsample_factor = subsample_factor
self.text_config = (Qwen2_5OmniTextConfig(
**text_config) if text_config else Qwen2_5OmniTextConfig())
self.text_config = (
Qwen2_5OmniTextConfig(**text_config)
if text_config
else Qwen2_5OmniTextConfig()
)
self.text_config.rope_scaling = None # uses_mrope is false
self.audio_token_id = audio_token_id
super().__init__(**kwargs)

View File

@@ -9,8 +9,7 @@ from vllm.logger import init_logger
logger = init_logger(__name__)
def adapt_config_dict(config_dict: dict[str, Any],
**kwargs) -> PretrainedConfig:
def adapt_config_dict(config_dict: dict[str, Any], **kwargs) -> PretrainedConfig:
config_dict.update(kwargs)
config_dict = _remap_general_mistral_args(config_dict)
@@ -25,15 +24,16 @@ def adapt_config_dict(config_dict: dict[str, Any],
if bool(config_dict.get("yarn")):
config_dict = _remap_mistral_yarn_args(config_dict)
is_vision = ((config_dict.get("multimodal")
or {}).get("vision_encoder_args")
or config_dict.get("vision_encoder"))
is_vision = (config_dict.get("multimodal") or {}).get(
"vision_encoder_args"
) or config_dict.get("vision_encoder")
is_audio = bool(
((config_dict.get("multimodal") or {}).get("whisper_model_args")
or {}).get("encoder_args"))
((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
"encoder_args"
)
)
assert not (is_vision and is_audio), \
"Vision and audio are mutually exclusive"
assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
if is_vision:
config_dict = _remap_mistral_vision_args(config_dict)
@@ -77,7 +77,7 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
config["rope_scaling"] = {
"rope_type": "yarn",
"mscale_all_dim": 1, # We hardcoded this to 1
**renamed_yarn_config
**renamed_yarn_config,
}
return config
@@ -105,8 +105,7 @@ def _remap_general_mistral_args(config: dict) -> dict:
if key in config:
config[new_key] = config.pop(key)
for new_key, (key,
default_value) in top_level_mapping_with_default.items():
for new_key, (key, default_value) in top_level_mapping_with_default.items():
config[new_key] = config.pop(key, default_value)
return config
@@ -116,16 +115,12 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
quantization = config.get("quantization", {})
if quantization.get("qformat_weight") == "fp8_e4m3":
# This maps to the FP8 static per-tensor quantization scheme
quantization_config = {
"quant_method": "fp8",
"activation_scheme": "static"
}
quantization_config = {"quant_method": "fp8", "activation_scheme": "static"}
elif quantization.get("quant_method") == "compressed-tensors":
# Pass through the quantization config to compressed-tensors
quantization_config = quantization
else:
raise ValueError(
f"Found unknown quantization='{quantization}' in config")
raise ValueError(f"Found unknown quantization='{quantization}' in config")
config["quantization_config"] = quantization_config
@@ -139,13 +134,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
quant_config = config.get("quantization_config")
config = {
"model_type":
"whixtral",
"model_type": "whixtral",
"architectures": ["VoxtralForConditionalGeneration"],
"text_config":
PretrainedConfig.from_dict(config),
"audio_config":
WhisperConfig(
"text_config": PretrainedConfig.from_dict(config),
"audio_config": WhisperConfig(
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
window_size=encoder_args["audio_encoding_args"]["window_size"],
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
@@ -158,7 +150,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
vocab_size=encoder_args["vocab_size"],
max_source_positions=encoder_args["max_source_positions"],
is_encoder_decoder=False, # Override WhisperConfig default
)
),
}
if quant_config:
config["quantization_config"] = quant_config

View File

@@ -13,16 +13,18 @@ class MLPSpeculatorConfig(PretrainedConfig):
"hidden_size": "emb_dim",
}
def __init__(self,
vocab_size: int = 32000,
emb_dim: int = 4096,
inner_dim: int = 0,
n_predict: int = 3,
top_k_tokens_per_head: Optional[list[int]] = None,
n_candidates: int = 5,
tie_weights: bool = False,
scale_input: bool = False,
**kwargs):
def __init__(
self,
vocab_size: int = 32000,
emb_dim: int = 4096,
inner_dim: int = 0,
n_predict: int = 3,
top_k_tokens_per_head: Optional[list[int]] = None,
n_candidates: int = 5,
tie_weights: bool = False,
scale_input: bool = False,
**kwargs,
):
"""
Initialize an MLPSpeculatorConfig

View File

@@ -8,16 +8,16 @@ class MoonViTConfig(PretrainedConfig):
model_type = "moonvit"
def __init__(
self,
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: tuple[int, int] = (2, 2),
**kwargs,
self,
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: tuple[int, int] = (2, 2),
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size

View File

@@ -62,7 +62,7 @@ class NemotronConfig(PretrainedConfig):
(MQA) otherwise GQA is used. When converting a multi-head
checkpoint to a GQA checkpoint, each group key and value
head should be constructed by meanpooling all the original
heads within that group. For more details checkout
heads within that group. For more details checkout
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
is not specified, will default to `num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
@@ -147,8 +147,9 @@ class NemotronConfig(PretrainedConfig):
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
head_dim = head_dim or kwargs.get("kv_channels")
self.head_dim = head_dim if head_dim is not None else (
hidden_size // num_attention_heads)
self.head_dim = (
head_dim if head_dim is not None else (hidden_size // num_attention_heads)
)
# for backward compatibility
if num_key_value_heads is None:
@@ -162,8 +163,11 @@ class NemotronConfig(PretrainedConfig):
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# for backward compatibility
partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
"rope_percentage") or partial_rotary_factor
partial_rotary_factor = (
kwargs.get("rope_percent")
or kwargs.get("rope_percentage")
or partial_rotary_factor
)
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
self.attention_bias = attention_bias
@@ -185,21 +189,24 @@ class NemotronConfig(PretrainedConfig):
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(
self.rope_scaling) != 2:
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, "
f"`type` and `factor`, got {self.rope_scaling}")
f"`type` and `factor`, got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in [
"linear", "dynamic"
]:
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
"`rope_scaling`'s type field must be one of ['linear', "
f"'dynamic'], got {rope_scaling_type}")
if rope_scaling_factor is None or not isinstance(
rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
f"'dynamic'], got {rope_scaling_type}"
)
if (
rope_scaling_factor is None
or not isinstance(rope_scaling_factor, float)
or rope_scaling_factor <= 1.0
):
raise ValueError(
"`rope_scaling`'s factor field must be a float > 1, got "
f"{rope_scaling_factor}")
f"{rope_scaling_factor}"
)
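For reference, a minimal sketch of a rope_scaling value that satisfies the validation shown in this hunk; the field names come straight from the code above, while the concrete numbers are only illustrative.

# Exactly two fields: a "type" of "linear" or "dynamic" and a float "factor" > 1.0.
rope_scaling = {"type": "dynamic", "factor": 2.0}

# Values the checks above would reject:
#   {"type": "yarn", "factor": 2.0}   -> type must be "linear" or "dynamic"
#   {"type": "linear", "factor": 1}   -> factor must be a float strictly > 1
#   {"type": "linear"}                -> must contain both fields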

View File

@@ -203,11 +203,11 @@ class NemotronHConfig(PretrainedConfig):
# Validate hybrid_override_pattern
# M: Mamba2, *: Attention, -: MLP
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
"hybrid_override_pattern must have same length as "
"num_hidden_layers")
"hybrid_override_pattern must have same length as num_hidden_layers"
)
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
"hybrid_override_pattern must only contain characters "
"'M', '*', or '-'")
"hybrid_override_pattern must only contain characters 'M', '*', or '-'"
)
# for backward compatibility
if num_key_value_heads is None:
@@ -253,7 +253,10 @@ class NemotronHConfig(PretrainedConfig):
@property
def layers_block_type(self):
return [
"mamba" if self.hybrid_override_pattern[i] == "M" else
"attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
"mamba"
if self.hybrid_override_pattern[i] == "M"
else "attention"
if self.hybrid_override_pattern[i] == "*"
else "mlp"
for i in range(self.num_hidden_layers)
]
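As a quick illustration of the mapping above, a hypothetical six-layer override pattern resolves like this:

pattern = "M*-M*-"  # hypothetical pattern: Mamba2, attention, MLP, repeated
layers = [
    "mamba" if ch == "M" else "attention" if ch == "*" else "mlp"
    for ch in pattern
]
assert layers == ["mamba", "attention", "mlp", "mamba", "attention", "mlp"]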

View File

@@ -5,7 +5,6 @@ from transformers.configuration_utils import PretrainedConfig
class Olmo3Config(PretrainedConfig):
model_type = "olmo3"
keys_to_ignore_at_inference = ["past_key_values"]

View File

@@ -16,8 +16,7 @@
# limitations under the License.
"""Qwen3-Next model configuration"""
from transformers.configuration_utils import (PretrainedConfig,
layer_type_validation)
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging

View File

@@ -81,11 +81,11 @@ class RadioConfig(PretrainedConfig):
self.initializer_factor = initializer_factor
self.hidden_act = hidden_act
self.max_img_size = max_img_size
self.norm_mean = list(norm_mean) if isinstance(norm_mean,
(tuple,
list)) else norm_mean
self.norm_std = list(norm_std) if isinstance(norm_std,
(tuple,
list)) else norm_std
self.norm_mean = (
list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
)
self.norm_std = (
list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
)
self.reg_tokens = reg_tokens
super().__init__(**kwargs)

View File

@@ -5,7 +5,6 @@ SUPPORTED_SPECULATORS_TYPES = {}
def register_speculator(name):
def decorator(fn):
SUPPORTED_SPECULATORS_TYPES[name] = fn
return fn
@@ -17,7 +16,7 @@ def register_speculator(name):
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
"""
Apply Eagle-3 specific configuration transformations.
Eagle-3 specific fields:
- draft_vocab_size: Size of the draft model's vocabulary
- target_hidden_size: Hidden size of the target model
@@ -27,6 +26,5 @@ def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
if config_dict.get("target_hidden_size") is not None:
vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
vllm_config["norm_before_residual"] = config_dict.get(
"norm_before_residual", True)
vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]

View File

@@ -6,7 +6,8 @@ from typing import Any, Union
from transformers import PretrainedConfig
from vllm.transformers_utils.configs.speculators.algos import (
SUPPORTED_SPECULATORS_TYPES)
SUPPORTED_SPECULATORS_TYPES,
)
__all__ = ["SpeculatorsConfig"]
@@ -21,27 +22,27 @@ class SpeculatorsConfig(PretrainedConfig):
**kwargs,
) -> "SpeculatorsConfig":
"""Load speculators Eagle config and convert to vLLM format."""
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
**kwargs)
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
vllm_config = cls.extract_vllm_speculative_config(config_dict)
return cls(**vllm_config)
@classmethod
def extract_vllm_speculative_config(
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
cls, config_dict: dict[str, Any]
) -> dict[str, Any]:
speculators_model_type = config_dict.get("speculators_model_type")
if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
raise ValueError(
f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
"Please ensure you're loading a speculators-format model.")
"Please ensure you're loading a speculators-format model."
)
# validate fields
# TODO: @dsikka - use speculators pydantic model to validate
cls.validate_speculators_config(config_dict=config_dict)
# Convert from speculators config -> format that can be ingested by vLLM
vllm_config = cls.build_vllm_speculative_config(
config_dict=config_dict)
vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
# Apply anything specific to the supported algorithm
algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
algo_updater(config_dict=config_dict, vllm_config=vllm_config)
@@ -64,11 +65,13 @@ class SpeculatorsConfig(PretrainedConfig):
if not isinstance(config_dict["transformer_layer_config"], dict):
raise TypeError(
"'transformer_layer_config' must be a dictionary if provided")
"'transformer_layer_config' must be a dictionary if provided"
)
@classmethod
def build_vllm_speculative_config(
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
cls, config_dict: dict[str, Any]
) -> dict[str, Any]:
"""
Build vLLM-compatible speculative configuration from speculators format.
@@ -94,14 +97,14 @@ class SpeculatorsConfig(PretrainedConfig):
if num_speculative_tokens is None:
raise ValueError(
"Missing 'speculative_tokens' in proposal method. "
f"Got: {first_method}")
f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
)
# Build base vLLM speculative configuration
vllm_config = {
"method": config_dict.get("speculators_model_type"),
"num_speculative_tokens": num_speculative_tokens,
"target_model": spec_config.get("verifier")["name_or_path"]
"target_model": spec_config.get("verifier")["name_or_path"],
}
# Merge transformer layer configuration if present

View File

@@ -59,13 +59,64 @@ class Step3TextConfig(PretrainedConfig):
share_q_dim: int = 2048,
head_dim: int = 256,
norm_expert_weight: bool = False,
moe_layers_enum: tuple[int,
...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59),
moe_layers_enum: tuple[int, ...] = (
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
),
**kwargs,
) -> None:
self.hidden_size = hidden_size

View File

@@ -42,6 +42,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
projector or at the end. Versions v0.4.1 and below
use `False`, but v0.5 and above use `True`.
"""
wrapped_model_config: transformers.PretrainedConfig
model_type = "ultravox"
audio_token = "<|audio|>"
@@ -76,15 +77,17 @@ class UltravoxConfig(transformers.PretrainedConfig):
if text_model_id is None:
text_config = text_config or {}
self.wrapped_model_config = transformers.CONFIG_MAPPING[
text_config.get("model_type", "llama")](**text_config)
text_config.get("model_type", "llama")
](**text_config)
# N.B. May set the audio_config below.
self.audio_model_id = audio_model_id
if audio_model_id is None:
self.audio_model_id = None
audio_config = audio_config or {}
self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
"model_type", "whisper")](**audio_config)
self.audio_config = transformers.CONFIG_MAPPING[
audio_config.get("model_type", "whisper")
](**audio_config)
super().__init__(**kwargs)
@@ -99,8 +102,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
if key == "text_model_id" and value is not None:
from vllm.transformers_utils.config import get_config
self.wrapped_model_config = get_config(value,
trust_remote_code=False)
self.wrapped_model_config = get_config(value, trust_remote_code=False)
elif key == "audio_model_id" and value is not None:
from vllm.transformers_utils.config import get_config

View File

@@ -30,8 +30,9 @@ def _convert_tokens_to_string_with_added_encoders(
current_sub_text: list[str] = []
convert_tokens_to_string = tokenizer.convert_tokens_to_string
added_vocab_set = set(tokenizer.get_added_vocab())
all_special_tokens = set(
tokenizer.all_special_tokens) if skip_special_tokens else ()
all_special_tokens = (
set(tokenizer.all_special_tokens) if skip_special_tokens else ()
)
for token in output_tokens:
# Use precomputed set for skip-special check
@@ -70,11 +71,11 @@ def convert_prompt_ids_to_tokens(
# We do not need to convert the whole prompt to tokens.
# Offset a little more in case we have special tokens.
new_tokens = tokenizer.convert_ids_to_tokens(
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
skip_special_tokens=skip_special_tokens)
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :],
skip_special_tokens=skip_special_tokens,
)
read_offset = len(new_tokens)
prefix_offset = max(
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
# This is required to guard against out-of-vocab prompt token ids
_replace_none_with_empty(new_tokens) # type: ignore[arg-type]
return new_tokens, prefix_offset, read_offset
@@ -92,7 +93,7 @@ def convert_ids_list_to_tokens(
Returns:
Python list of token string representations
"""
token_str_lst = []
for token_id in token_ids:
@@ -144,18 +145,17 @@ def detokenize_incrementally(
# This is the first iteration for this sequence
is_first_iter = prev_tokens is None
if is_first_iter:
(prev_tokens, prefix_offset,
read_offset) = convert_prompt_ids_to_tokens(
tokenizer,
all_input_ids[:-1],
skip_special_tokens=skip_special_tokens)
(prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
)
assert prev_tokens is not None
# If the new token id is out of bounds, return an empty string.
if 0 <= new_token_id < len(tokenizer):
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens = tokenizer.convert_ids_to_tokens(
[new_token_id], skip_special_tokens=skip_special_tokens)
[new_token_id], skip_special_tokens=skip_special_tokens
)
if isinstance(new_tokens, str):
new_tokens = [new_tokens]
else:
@@ -171,9 +171,9 @@ def detokenize_incrementally(
# surrounding ids.
if tokenizer.is_fast or not tokenizer.get_added_vocab():
prefix_text = tokenizer.convert_tokens_to_string(
output_tokens[prefix_offset:read_offset])
new_text = tokenizer.convert_tokens_to_string(
output_tokens[prefix_offset:])
output_tokens[prefix_offset:read_offset]
)
new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
else:
prefix_text = _convert_tokens_to_string_with_added_encoders(
tokenizer,
@@ -195,5 +195,5 @@ def detokenize_incrementally(
# by the model
return new_tokens, "", prefix_offset, read_offset
new_text = new_text[len(prefix_text):]
new_text = new_text[len(prefix_text) :]
return new_tokens, new_text, read_offset, len(output_tokens)
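For a rough feel of the offset bookkeeping in convert_prompt_ids_to_tokens above, assuming INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET is 5 (an assumption about this module's constant, not stated in the hunk):

OFFSET = 5  # assumed value of INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET
prompt_ids = list(range(12))  # a made-up 12-token prompt

# Only the last OFFSET + 2 ids are detokenized up front.
tail = prompt_ids[-OFFSET - 2:]               # 7 ids
read_offset = len(tail)                       # 7
prefix_offset = max(read_offset - OFFSET, 0)  # 2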

View File

@@ -4,8 +4,12 @@
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Optional, Union, cast
from transformers import (AutoFeatureExtractor, AutoImageProcessor,
AutoProcessor, AutoVideoProcessor)
from transformers import (
AutoFeatureExtractor,
AutoImageProcessor,
AutoProcessor,
AutoVideoProcessor,
)
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.image_processing_utils import BaseImageProcessor
from transformers.processing_utils import ProcessorMixin
@@ -121,15 +125,18 @@ def get_processor(
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
if not isinstance(processor, processor_cls):
raise TypeError("Invalid type of HuggingFace processor. "
f"Expected type: {processor_cls}, but "
f"found type: {type(processor)}")
raise TypeError(
"Invalid type of HuggingFace processor. "
f"Expected type: {processor_cls}, but "
f"found type: {type(processor)}"
)
return processor
@@ -158,7 +165,7 @@ def get_feature_extractor(
trust_remote_code: bool = False,
**kwargs: Any,
):
"""Load an audio feature extractor for the given model name
"""Load an audio feature extractor for the given model name
via HuggingFace."""
try:
feature_extractor = AutoFeatureExtractor.from_pretrained(
@@ -166,7 +173,8 @@ def get_feature_extractor(
*args,
revision=revision,
trust_remote_code=trust_remote_code,
**kwargs)
**kwargs,
)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
@@ -177,7 +185,8 @@ def get_feature_extractor(
"extractor is a custom extractor not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -213,7 +222,8 @@ def get_image_processor(
*args,
revision=revision,
trust_remote_code=trust_remote_code,
**kwargs)
**kwargs,
)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
@@ -224,7 +234,8 @@ def get_image_processor(
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -263,7 +274,8 @@ def get_video_processor(
*args,
revision=revision,
trust_remote_code=trust_remote_code,
**kwargs)
**kwargs,
)
except ValueError as e:
# If the error pertains to the processor class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
@@ -274,7 +286,8 @@ def get_video_processor(
"a custom processor not yet available in the HuggingFace "
"transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e

View File

@@ -8,8 +8,7 @@ reasons:
- There is a need to override the existing processor to support vLLM.
"""
from vllm.transformers_utils.processors.deepseek_vl2 import (
DeepseekVLV2Processor)
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor

View File

@@ -30,8 +30,7 @@ import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
Unpack)
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from vllm.multimodal.image import convert_image_mode

View File

@@ -9,33 +9,31 @@ import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
Unpack)
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
__all__ = ['Ovis2_5Processor']
__all__ = ["Ovis2_5Processor"]
IMAGE_TOKEN = "<image>"
VIDEO_TOKEN = "<video>"
MIN_PIXELS = 448 * 448
MAX_PIXELS = 1792 * 1792
class Ovis2_5ProcessorKwargs(ProcessingKwargs,
total=False): # type: ignore[call-arg]
class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
_defaults = {
"text_kwargs": {
"padding": False,
},
"images_kwargs": {
'convert_to_rgb': True,
'min_pixels': MIN_PIXELS,
'max_pixels': MAX_PIXELS,
"convert_to_rgb": True,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS,
},
"videos_kwargs": {
'convert_to_rgb': True,
'min_pixels': MIN_PIXELS,
'max_pixels': MAX_PIXELS,
}
"convert_to_rgb": True,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS,
},
}
@@ -43,8 +41,8 @@ class Ovis2_5Processor(ProcessorMixin):
r"""
Constructs an Ovis processor which wraps an Ovis image processor
and a Qwen2 tokenizer into a single processor.
[`OvisProcessor`] offers all the functionalities of
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
[`OvisProcessor`] offers all the functionalities of
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
for more information.
Args:
@@ -81,9 +79,7 @@ class Ovis2_5Processor(ProcessorMixin):
self.patch_size = patch_size
self.hidden_stride = hidden_stride
self.temporal_patch_size = temporal_patch_size
super().__init__(image_processor,
tokenizer,
chat_template=chat_template)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@cached_property
def extra_special_tokens(self):
@@ -96,7 +92,7 @@ class Ovis2_5Processor(ProcessorMixin):
"image_end": -302,
"video_start": -303,
"video_end": -304,
'image_pad': image_pad_token_id,
"image_pad": image_pad_token_id,
}
return extra_special_tokens
@@ -104,8 +100,9 @@ class Ovis2_5Processor(ProcessorMixin):
self,
images: ImageInput = None,
videos: Union[np.ndarray, list[ImageInput]] = None,
text: Union[TextInput, PreTokenizedInput, list[TextInput],
list[PreTokenizedInput]] = None,
text: Union[
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
] = None,
**kwargs: Unpack[Ovis2_5ProcessorKwargs],
) -> BatchFeature:
"""
@@ -148,9 +145,9 @@ class Ovis2_5Processor(ProcessorMixin):
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- list of token ids to be fed to a model.
Returned when `text` is not `None`.
- **attention_mask** -- list of indices specifying which tokens
- **attention_mask** -- list of indices specifying which tokens
should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"*
`return_attention_mask=True` or if *"attention_mask"*
is in `self.model_input_names` and if `text` is not `None`).
- **pixel_values** -- Pixel values to be fed to a model.
Returned when `images` is not `None`.
@@ -177,9 +174,9 @@ class Ovis2_5Processor(ProcessorMixin):
grids = []
# Process each image
for image in images if isinstance(images, list) else [images]:
pixel_values, image_placeholders, grid = (
self.preprocess_multidata(
images=image, **output_kwargs["images_kwargs"]))
pixel_values, image_placeholders, grid = self.preprocess_multidata(
images=image, **output_kwargs["images_kwargs"]
)
processed_images.append(pixel_values)
image_placeholders_list.append(image_placeholders)
grids.append(grid)
@@ -196,16 +193,15 @@ class Ovis2_5Processor(ProcessorMixin):
grids = []
# Process each video
for video in videos if isinstance(videos, list) else [videos]:
pixel_values, video_placeholders, grid = (
self.preprocess_multidata(
video=video, **output_kwargs["videos_kwargs"]))
pixel_values, video_placeholders, grid = self.preprocess_multidata(
video=video, **output_kwargs["videos_kwargs"]
)
processed_videos.append(pixel_values)
videos_placeholders_list.append(video_placeholders)
grids.append(grid)
# assign all processed videos
if processed_videos:
visual_features[
"video_placeholders"] = videos_placeholders_list
visual_features["video_placeholders"] = videos_placeholders_list
output["video_pixel_values"] = processed_videos
output["video_grids"] = grids
@@ -220,14 +216,16 @@ class Ovis2_5Processor(ProcessorMixin):
image_idx = 0
video_idx = 0
for ids_tensor in tokenized_batched_text:
has_image_tokens = (image_token_id in ids_tensor
and "image_placeholders" in visual_features
and image_idx < len(
visual_features["image_placeholders"]))
has_video_tokens = (video_token_id in ids_tensor
and "video_placeholders" in visual_features
and video_idx < len(
visual_features["video_placeholders"]))
has_image_tokens = (
image_token_id in ids_tensor
and "image_placeholders" in visual_features
and image_idx < len(visual_features["image_placeholders"])
)
has_video_tokens = (
video_token_id in ids_tensor
and "video_placeholders" in visual_features
and video_idx < len(visual_features["video_placeholders"])
)
if has_image_tokens or has_video_tokens:
# Convert to list for easier manipulation
ids_list = ids_tensor.tolist()
@@ -237,13 +235,13 @@ class Ovis2_5Processor(ProcessorMixin):
for token_id in ids_list:
if token_id == image_token_id:
new_ids.extend(
visual_features["image_placeholders"]
[image_idx])
visual_features["image_placeholders"][image_idx]
)
image_idx += 1
elif token_id == video_token_id:
new_ids.extend(
visual_features["video_placeholders"]
[video_idx])
visual_features["video_placeholders"][video_idx]
)
video_idx += 1
else:
new_ids.append(token_id)
@@ -260,8 +258,7 @@ class Ovis2_5Processor(ProcessorMixin):
# If only images were provided
return BatchFeature(data=visual_features)
def _tokenize_with_visual_symbol(self,
text_list: list[str]) -> torch.LongTensor:
def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
batch_token_ids = []
for text in text_list:
token_ids = []
@@ -288,21 +285,24 @@ class Ovis2_5Processor(ProcessorMixin):
return torch.tensor(batch_token_ids, dtype=torch.long)
# Copied from qwen2_vl
def smart_resize(self,
height: int,
width: int,
factor: int = 28,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS):
def smart_resize(
self,
height: int,
width: int,
factor: int = 28,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
):
"""Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range
2. The total number of pixels is within the range
['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if height < factor or width < factor:
print(f"height:{height} or width:{width} must be "
f"larger than factor:{factor}")
print(
f"height:{height} or width:{width} must be larger than factor:{factor}"
)
if height < width:
width = round(factor / height * width)
height = factor
@@ -311,8 +311,10 @@ class Ovis2_5Processor(ProcessorMixin):
width = factor
elif max(height, width) / min(height, width) > 200:
print(f"absolute aspect ratio must be smaller than 200, "
f"got {max(height, width) / min(height, width)}")
print(
f"absolute aspect ratio must be smaller than 200, "
f"got {max(height, width) / min(height, width)}"
)
if height > width:
height = 200 * width
else:
@@ -335,29 +337,27 @@ class Ovis2_5Processor(ProcessorMixin):
def construct_visual_indicators(self, grid, is_video: bool = False):
if is_video:
start_token = self.get_token_value('video_start')
end_token = self.get_token_value('video_end')
start_token = self.get_token_value("video_start")
end_token = self.get_token_value("video_end")
else:
start_token = self.get_token_value('image_start')
end_token = self.get_token_value('image_end')
start_token = self.get_token_value("image_start")
end_token = self.get_token_value("image_end")
image_placeholders = [start_token, self.get_token_value('visual_atom')]
image_placeholders = [start_token, self.get_token_value("visual_atom")]
if grid[0] * grid[1] > 1:
for r in range(grid[0]):
for c in range(grid[1]):
image_placeholders.append(
self.get_token_value('visual_atom'))
image_placeholders.append(self.get_token_value("visual_atom"))
image_placeholders.append(end_token)
return image_placeholders
def construct_visual_placeholders(self, grid, is_video: bool = False):
visual_placeholders = self.construct_visual_indicators((1, 1),
is_video)
visual_placeholders = self.construct_visual_indicators((1, 1), is_video)
image_atom_token_id = self.get_token_value('visual_atom')
image_atom_token_id = self.get_token_value("visual_atom")
# Extract the padding token ID from tokenizer
image_padding_token_id = self.get_token_value('image_pad')
image_padding_token_id = self.get_token_value("image_pad")
num_image_atoms = grid[0] * grid[1] * grid[2]
num_image_atoms //= self.hidden_stride**2
@@ -367,8 +367,9 @@ class Ovis2_5Processor(ProcessorMixin):
padded_placeholder_tokens = []
for token in visual_placeholders:
if token == image_atom_token_id:
padded_placeholder_tokens.extend([image_padding_token_id] *
num_image_atoms)
padded_placeholder_tokens.extend(
[image_padding_token_id] * num_image_atoms
)
else:
padded_placeholder_tokens.append(image_padding_token_id)
return padded_placeholder_tokens
@@ -380,7 +381,7 @@ class Ovis2_5Processor(ProcessorMixin):
convert_to_rgb: Optional[bool] = True,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
return_tensors: Optional[str] = 'pt',
return_tensors: Optional[str] = "pt",
):
is_video = False
if images is not None:
@@ -396,11 +397,12 @@ class Ovis2_5Processor(ProcessorMixin):
images.append(image)
elif isinstance(video, list):
images = video
min_pixels = min(max_pixels if max_pixels is not None else MAX_PIXELS,
min_pixels if min_pixels is not None else MIN_PIXELS)
min_pixels = min(
max_pixels if max_pixels is not None else MAX_PIXELS,
min_pixels if min_pixels is not None else MIN_PIXELS,
)
images = [
image.convert("RGB")
if convert_to_rgb and image.mode != 'RGB' else image
image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image
for image in images
]
@@ -417,14 +419,16 @@ class Ovis2_5Processor(ProcessorMixin):
)
new_size = dict(height=resized_height, width=resized_width)
image_pt = self.image_processor.preprocess(
image, size=new_size, return_tensors="np")['pixel_values'][0]
image, size=new_size, return_tensors="np"
)["pixel_values"][0]
processed_images.append(image_pt)
patches = np.array(processed_images)
if patches.shape[0] % self.temporal_patch_size != 0:
num_to_pad = self.temporal_patch_size - (patches.shape[0] %
self.temporal_patch_size)
num_to_pad = self.temporal_patch_size - (
patches.shape[0] % self.temporal_patch_size
)
repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
patches = np.concatenate([patches, repeats], axis=0)
channel = patches.shape[1]
@@ -445,14 +449,18 @@ class Ovis2_5Processor(ProcessorMixin):
)
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
flatten_patches = patches.reshape(
grid_t * grid_h * grid_w, channel * self.temporal_patch_size *
self.patch_size * self.patch_size)
grid_t * grid_h * grid_w,
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
)
visual_placeholders = self.construct_visual_placeholders(
[grid_t, grid_h, grid_w], is_video)
return torch.tensor(
flatten_patches), visual_placeholders, torch.tensor(
[[grid_t, grid_h, grid_w]])
[grid_t, grid_h, grid_w], is_video
)
return (
torch.tensor(flatten_patches),
visual_placeholders,
torch.tensor([[grid_t, grid_h, grid_w]]),
)
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
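The smart_resize docstring above states three constraints: both dimensions divisible by factor, total pixels within [MIN_PIXELS, MAX_PIXELS], and aspect ratio preserved as closely as possible. A standalone sketch of that arithmetic, not the processor's exact code, looks roughly like this:

import math

def sketch_smart_resize(height, width, factor=28,
                        min_pixels=448 * 448, max_pixels=1792 * 1792):
    # Snap both dimensions to multiples of `factor`.
    h = round(height / factor) * factor
    w = round(width / factor) * factor
    # Rescale uniformly if the pixel count falls outside the allowed range,
    # keeping the aspect ratio and re-snapping to multiples of `factor`.
    if h * w > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h = math.floor(height / beta / factor) * factor
        w = math.floor(width / beta / factor) * factor
    elif h * w < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * beta / factor) * factor
        w = math.ceil(width * beta / factor) * factor
    return h, w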

View File

@@ -14,7 +14,7 @@ from vllm.utils import PlaceholderModule
logger = init_logger(__name__)
SUPPORTED_SCHEMES = ['s3://', 'gs://']
SUPPORTED_SCHEMES = ["s3://", "gs://"]
try:
from runai_model_streamer import list_safetensors as runai_list_safetensors
@@ -22,11 +22,9 @@ try:
except (ImportError, OSError):
# see https://github.com/run-ai/runai-model-streamer/issues/26
# OSError will be raised on arm64 platform
runai_model_streamer = PlaceholderModule(
"runai_model_streamer") # type: ignore[assignment]
runai_model_streamer = PlaceholderModule("runai_model_streamer") # type: ignore[assignment]
runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
runai_list_safetensors = runai_model_streamer.placeholder_attr(
"list_safetensors")
runai_list_safetensors = runai_model_streamer.placeholder_attr("list_safetensors")
def list_safetensors(path: str = "") -> list[str]:
@@ -65,8 +63,10 @@ class ObjectStorageModel:
signal.signal(sig, self._close_by_signal(existing_handler))
dir_name = os.path.join(
get_cache_dir(), "model_streamer",
hashlib.sha256(str(url).encode()).hexdigest()[:8])
get_cache_dir(),
"model_streamer",
hashlib.sha256(str(url).encode()).hexdigest()[:8],
)
if os.path.exists(dir_name):
shutil.rmtree(dir_name)
os.makedirs(dir_name)
@@ -78,7 +78,6 @@ class ObjectStorageModel:
shutil.rmtree(self.dir)
def _close_by_signal(self, existing_handler=None):
def new_handler(signum, frame):
self._close()
if existing_handler:
@@ -86,10 +85,12 @@ class ObjectStorageModel:
return new_handler
def pull_files(self,
model_path: str = "",
allow_pattern: Optional[list[str]] = None,
ignore_pattern: Optional[list[str]] = None) -> None:
def pull_files(
self,
model_path: str = "",
allow_pattern: Optional[list[str]] = None,
ignore_pattern: Optional[list[str]] = None,
) -> None:
"""
Pull files from object storage into the temporary directory.

View File

@@ -17,21 +17,25 @@ except ImportError:
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
return [
path for path in paths if any(
fnmatch.fnmatch(path, pattern) for pattern in patterns)
path
for path in paths
if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
]
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
return [
path for path in paths
path
for path in paths
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
]
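These two helpers are plain fnmatch filters; a tiny illustration with made-up object keys:

import fnmatch

paths = ["model/config.json", "model/model.safetensors", "model/logs/run.txt"]
allowed = [p for p in paths
           if any(fnmatch.fnmatch(p, pat) for pat in ["*.json", "*.safetensors"])]
kept = [p for p in allowed
        if not any(fnmatch.fnmatch(p, pat) for pat in ["*/logs/*"])]
# kept == ["model/config.json", "model/model.safetensors"]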
def glob(s3: Optional["BaseClient"] = None,
path: str = "",
allow_pattern: Optional[list[str]] = None) -> list[str]:
def glob(
s3: Optional["BaseClient"] = None,
path: str = "",
allow_pattern: Optional[list[str]] = None,
) -> list[str]:
"""
List full file names from S3 path and filter by allow pattern.
@@ -47,17 +51,15 @@ def glob(s3: Optional["BaseClient"] = None,
s3 = boto3.client("s3")
if not path.endswith("/"):
path = path + "/"
bucket_name, _, paths = list_files(s3,
path=path,
allow_pattern=allow_pattern)
bucket_name, _, paths = list_files(s3, path=path, allow_pattern=allow_pattern)
return [f"s3://{bucket_name}/{path}" for path in paths]
def list_files(
s3: "BaseClient",
path: str,
allow_pattern: Optional[list[str]] = None,
ignore_pattern: Optional[list[str]] = None
s3: "BaseClient",
path: str,
allow_pattern: Optional[list[str]] = None,
ignore_pattern: Optional[list[str]] = None,
) -> tuple[str, str, list[str]]:
"""
List files from S3 path and filter by pattern.
@@ -71,17 +73,17 @@ def list_files(
Returns:
tuple[str, str, list[str]]: A tuple where:
- The first element is the bucket name
- The second element is string represent the bucket
- The second element is string represent the bucket
and the prefix as a dir like string
- The third element is a list of files allowed or
- The third element is a list of files allowed or
disallowed by pattern
"""
parts = path.removeprefix('s3://').split('/')
prefix = '/'.join(parts[1:])
parts = path.removeprefix("s3://").split("/")
prefix = "/".join(parts[1:])
bucket_name = parts[0]
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
paths = [obj['Key'] for obj in objects.get('Contents', [])]
paths = [obj["Key"] for obj in objects.get("Contents", [])]
paths = _filter_ignore(paths, ["*/"])
if allow_pattern is not None:

View File

@@ -10,14 +10,12 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union
import huggingface_hub
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from typing_extensions import assert_never
from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.config import (
get_sentence_transformer_tokenizer_config)
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
@@ -32,8 +30,7 @@ else:
logger = init_logger(__name__)
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
TokenizerBase]
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, TokenizerBase]
def decode_tokens(
@@ -50,8 +47,7 @@ def decode_tokens(
settings.
"""
if skip_special_tokens is not None:
return tokenizer.decode(token_ids,
skip_special_tokens=skip_special_tokens)
return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
return tokenizer.decode(token_ids)
@@ -95,8 +91,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
tokenizer_all_special_ids = tokenizer.all_special_ids
tokenizer_all_special_tokens = tokenizer.all_special_tokens
tokenizer_all_special_tokens_extended = (
tokenizer.all_special_tokens_extended)
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
tokenizer_vocab = tokenizer.get_vocab()
tokenizer_len = len(tokenizer)
@@ -110,7 +105,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
max_token_id = max(max_token_id, tokenizer.vocab_size)
class CachedTokenizer(tokenizer.__class__): # type: ignore
@property
def all_special_ids(self) -> list[int]:
return tokenizer_all_special_ids
@@ -134,7 +128,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
return tokenizer_len
def __reduce__(self):
return get_cached_tokenizer, (tokenizer, )
return get_cached_tokenizer, (tokenizer,)
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
@@ -151,8 +145,7 @@ def get_tokenizer(
download_dir: Optional[str] = None,
**kwargs,
) -> AnyTokenizer:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope.
"""
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
@@ -173,13 +166,13 @@ def get_tokenizer(
revision=revision,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
tokenizer_name = tokenizer_path
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError(
"Cannot use the fast tokenizer in slow tokenizer mode.")
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if "truncation_side" not in kwargs:
@@ -195,23 +188,28 @@ def get_tokenizer(
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
if is_from_mistral_org and tokenizer_mode != "mistral":
warnings.warn(
'It is strongly recommended to run mistral models with '
"It is strongly recommended to run mistral models with "
'`--tokenizer-mode "mistral"` to ensure correct '
'encoding and decoding.',
"encoding and decoding.",
FutureWarning,
stacklevel=2)
stacklevel=2,
)
tokenizer: AnyTokenizer
if tokenizer_mode == "mistral":
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
revision=revision)
tokenizer = MistralTokenizer.from_pretrained(
str(tokenizer_name), revision=revision
)
elif tokenizer_mode == "custom":
from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
*args,
revision=revision,
download_dir=download_dir,
**kwargs)
tokenizer = TokenizerRegistry.get_tokenizer(
str(tokenizer_name),
*args,
revision=revision,
download_dir=download_dir,
**kwargs,
)
else:
try:
tokenizer = AutoTokenizer.from_pretrained(
@@ -226,13 +224,16 @@ def get_tokenizer(
# currently being imported,
# suggest using the --trust-remote-code flag.
if not trust_remote_code and (
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)):
err_msg = ("Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI.")
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)
):
err_msg = (
"Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -240,19 +241,21 @@ def get_tokenizer(
# The special_tokens in tokenizer should also be
# controlled by do_lower_case in encoder_config
encoder_config = get_sentence_transformer_tokenizer_config(
tokenizer_name, revision)
tokenizer_name, revision
)
if isinstance(encoder_config, dict) and encoder_config.get(
"do_lower_case", False):
"do_lower_case", False
):
special_tokens_map = {
k: v.lower()
for k, v in tokenizer.special_tokens_map.items()
k: v.lower() for k, v in tokenizer.special_tokens_map.items()
}
tokenizer.add_special_tokens(special_tokens_map)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
"slowdown. Consider using a fast tokenizer instead."
)
tokenizer = get_cached_tokenizer(tokenizer)
return tokenizer
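For context, a typical call into this helper might look as follows; the import path and model id are assumptions for illustration, not something this diff asserts.

from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer(
    "mistralai/Mistral-7B-Instruct-v0.3",  # example model id
    tokenizer_mode="mistral",
)
ids = tokenizer.encode("Hello from vLLM!")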

View File

@@ -10,7 +10,6 @@ if TYPE_CHECKING:
class TokenizerBase(ABC):
@property
@abstractmethod
def all_special_tokens_extended(self) -> list[str]:
@@ -98,18 +97,22 @@ class TokenizerBase(ABC):
raise NotImplementedError()
@abstractmethod
def encode(self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None) -> list[int]:
def encode(
self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
) -> list[int]:
raise NotImplementedError()
@abstractmethod
def apply_chat_template(self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs) -> list[int]:
def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs,
) -> list[int]:
raise NotImplementedError()
@abstractmethod
@@ -117,9 +120,9 @@ class TokenizerBase(ABC):
raise NotImplementedError()
@abstractmethod
def decode(self,
ids: Union[list[int], int],
skip_special_tokens: bool = True) -> str:
def decode(
self, ids: Union[list[int], int], skip_special_tokens: bool = True
) -> str:
raise NotImplementedError()
@abstractmethod

View File

@@ -1,10 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
truncate_tool_call_ids, validate_request_params)
from .mistral import (
MistralTokenizer,
maybe_serialize_tool_calls,
truncate_tool_call_ids,
validate_request_params,
)
__all__ = [
"MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
"validate_request_params"
"MistralTokenizer",
"maybe_serialize_tool_calls",
"truncate_tool_call_ids",
"validate_request_params",
]

View File

@@ -20,7 +20,8 @@ if TYPE_CHECKING:
# will not be bothered by the dependency.
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
MistralTokenizer as PublicMistralTokenizer,
)
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -51,7 +52,7 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
# - https://github.com/pydantic/pydantic/issues/9541
# TODO: remove when pydantic v2.11 is released
for i, message in enumerate(request.messages):
if message.get("role") == 'assistant':
if message.get("role") == "assistant":
tool_calls_validator = message.get("tool_calls", ().__iter__())
validated_tool_calls = []
while True:
@@ -67,7 +68,7 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
def truncate_tool_call_ids(request: "ChatCompletionRequest"):
"""Truncates tool call IDs for Mistral's ID requirements."""
for i, message in enumerate(request.messages):
if message.get("role") == 'assistant':
if message.get("role") == "assistant":
tool_calls = message.get("tool_calls", [])
for tool_call in tool_calls:
if len(tool_call["id"]) > 9:
@@ -95,17 +96,19 @@ def truncate_tool_call_ids(request: "ChatCompletionRequest"):
def validate_request_params(request: "ChatCompletionRequest"):
if (request.skip_special_tokens is not None
and not request.skip_special_tokens):
raise ValueError("skip_special_tokens=False is not supported "
"for Mistral tokenizers.")
if request.skip_special_tokens is not None and not request.skip_special_tokens:
raise ValueError(
"skip_special_tokens=False is not supported for Mistral tokenizers."
)
def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
repo_cache = os.path.join(
huggingface_hub.constants.HF_HUB_CACHE,
huggingface_hub.constants.REPO_ID_SEPARATOR.join(
["models", *repo_id.split("/")]))
["models", *repo_id.split("/")]
),
)
if revision is None:
revision_file = os.path.join(repo_cache, "refs", "main")
@@ -141,7 +144,8 @@ def find_tokenizer_file(files: list[str]):
raise OSError(
f"Found {len(matched_files)} files matching the "
f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
f"tokenizer is present in {files}.")
f"tokenizer is present in {files}."
)
return matched_files[0]
@@ -149,22 +153,23 @@ def find_tokenizer_file(files: list[str]):
def _aggregate_content(content: list) -> list[dict[str, Any]]:
aggregated_content: list[dict[str, Any]] = []
for chunk in content:
if chunk.get("type"
) == "text" and aggregated_content and aggregated_content[
-1].get("type") == "text":
if (
chunk.get("type") == "text"
and aggregated_content
and aggregated_content[-1].get("type") == "text"
):
aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
else:
aggregated_content.append(chunk)
if len(aggregated_content) == 1 and aggregated_content[0].get(
"type") == "text":
if len(aggregated_content) == 1 and aggregated_content[0].get("type") == "text":
content = aggregated_content[0]["text"]
return content
def make_mistral_chat_completion_request(
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str,
Any]]] = None) -> "ChatCompletionRequest":
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
) -> "ChatCompletionRequest":
last_message = cast(dict[str, Any], messages[-1])
if last_message["role"] == "assistant":
last_message["prefix"] = True
@@ -188,8 +193,7 @@ def make_mistral_chat_completion_request(
# even if they are empty.
if tools:
for function in [
tool["function"] for tool in tools
if tool["type"] == "function"
tool["function"] for tool in tools if tool["type"] == "function"
]:
if function.get("parameters") is None:
function["parameters"] = {}
@@ -197,12 +201,11 @@ def make_mistral_chat_completion_request(
function["description"] = ""
from mistral_common.protocol.instruct.request import ChatCompletionRequest
return ChatCompletionRequest(messages=messages,
tools=tools) # type: ignore[type-var]
return ChatCompletionRequest(messages=messages, tools=tools) # type: ignore[type-var]
class MistralTokenizer(TokenizerBase):
def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
self.mistral = tokenizer
self.instruct = tokenizer.instruct_tokenizer
@@ -215,10 +218,13 @@ class MistralTokenizer(TokenizerBase):
self.is_tekken = isinstance(tokenizer_, Tekkenizer)
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer)
SentencePieceTokenizer,
)
self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
self._special_token_policy = (SpecialTokenPolicy.IGNORE
if self.is_tekken else None)
self._special_token_policy = (
SpecialTokenPolicy.IGNORE if self.is_tekken else None
)
if not (self.is_tekken or self.is_spm):
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")
@@ -226,57 +232,54 @@ class MistralTokenizer(TokenizerBase):
# Convert to a dict[str, int] to match protocol, but this is a lossy
# conversion. There may be multiple token ids that decode to the same
# string due to partial UTF-8 byte sequences being converted to <20>
self._vocab_dict = {
token: idx
for idx, token in enumerate(self._vocab)
}
self._vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
self.tokenizer = tokenizer_
self._max_token_id = self.vocab_size - 1
@classmethod
def from_pretrained(cls,
path_or_repo_id: str,
*,
revision: Optional[str] = None) -> "MistralTokenizer":
def from_pretrained(
cls, path_or_repo_id: str, *, revision: Optional[str] = None
) -> "MistralTokenizer":
if not Path(path_or_repo_id).exists():
assert len(path_or_repo_id.split("/")) == 2, (
"You have either provided a non-existent path: "
"{path_or_repo_id} or an invalid HF Hub repo id.")
"{path_or_repo_id} or an invalid HF Hub repo id."
)
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
path_or_repo_id, revision)
path_or_repo_id, revision
)
elif Path(path_or_repo_id).is_dir():
tokenizer_file_name = find_tokenizer_file(
os.listdir(path_or_repo_id))
tokenizer_file_name = find_tokenizer_file(os.listdir(path_or_repo_id))
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
else:
assert Path(
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
assert Path(path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
tokenizer_file = str(Path(path_or_repo_id))
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
MistralTokenizer as PublicMistralTokenizer,
)
mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
return cls(mistral_tokenizer)
@staticmethod
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision: Optional[str]) -> str:
def _download_mistral_tokenizer_from_hf(
tokenizer_name: str, revision: Optional[str]
) -> str:
try:
hf_api = HfApi()
files = hf_api.list_repo_files(repo_id=tokenizer_name,
revision=revision)
files = hf_api.list_repo_files(repo_id=tokenizer_name, revision=revision)
except ConnectionError as exc:
files = list_local_repo_files(repo_id=tokenizer_name,
revision=revision)
files = list_local_repo_files(repo_id=tokenizer_name, revision=revision)
if len(files) == 0:
raise exc
filename = find_tokenizer_file(files)
tokenizer_file = hf_hub_download(tokenizer_name,
filename=filename,
revision=revision)
tokenizer_file = hf_hub_download(
tokenizer_name, filename=filename, revision=revision
)
return tokenizer_file
# the following attributes are set to fit vLLM's design and are used
@@ -290,10 +293,7 @@ class MistralTokenizer(TokenizerBase):
special_tokens = self.tokenizer.SPECIAL_TOKENS
else:
special_tokens = list(SpecialTokens)
return [
s.value if isinstance(s, SpecialTokens) else s
for s in special_tokens
]
return [s.value if isinstance(s, SpecialTokens) else s for s in special_tokens]
@property
def all_special_tokens(self) -> list[str]:
@@ -301,9 +301,7 @@ class MistralTokenizer(TokenizerBase):
@property
def all_special_ids(self) -> list[int]:
return [
self.all_special_tokens.index(t) for t in self.all_special_tokens
]
return [self.all_special_tokens.index(t) for t in self.all_special_tokens]
@property
def bos_token_id(self) -> int:
@@ -386,26 +384,29 @@ class MistralTokenizer(TokenizerBase):
input_ids = input_ids[:max_length]
return input_ids
def encode(self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None) -> list[int]:
def encode(
self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
) -> list[int]:
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
if add_special_tokens is not None:
return self.tokenizer.encode(text,
bos=add_special_tokens,
eos=add_special_tokens)
return self.tokenizer.encode(
text, bos=add_special_tokens, eos=add_special_tokens
)
else:
return self.tokenizer.encode(text, bos=True, eos=False)
def apply_chat_template(self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs) -> list[int]:
def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs,
) -> list[int]:
request = make_mistral_chat_completion_request(messages, tools)
encoded = self.mistral.encode_chat_completion(request)
@@ -414,11 +415,15 @@ class MistralTokenizer(TokenizerBase):
def convert_tokens_to_string(self, tokens: list[str]) -> str:
from mistral_common.tokens.tokenizers.base import SpecialTokens
if self.is_tekken:
tokens = [
t for t in tokens
if (t is SpecialTokens.tool_calls
or t not in self.tokenizer._all_special_tokens)
t
for t in tokens
if (
t is SpecialTokens.tool_calls
or t not in self.tokenizer._all_special_tokens
)
]
if any(isinstance(t, bytes) for t in tokens):
@@ -426,20 +431,20 @@ class MistralTokenizer(TokenizerBase):
shift = self.tokenizer.num_special_tokens
def _token_to_id(t: str):
t_bytes = t.encode("utf-8") \
if not isinstance(t, bytes) else t
t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
try:
return shift + \
self.tokenizer._tekken_token2id_nospecial[t_bytes]
return (
shift + self.tokenizer._tekken_token2id_nospecial[t_bytes]
)
except KeyError:
logger.warning(
"Failed to convert token %s to id,"
" replacing with <unk>", t_bytes)
"Failed to convert token %s to id, replacing with <unk>",
t_bytes,
)
return self.tokenizer.unk_id
ids = [_token_to_id(t) for t in tokens]
decoded = self.tokenizer.decode(ids,
self._special_token_policy)
decoded = self.tokenizer.decode(ids, self._special_token_policy)
else:
decoded = "".join(tokens)
else:
@@ -453,8 +458,10 @@ class MistralTokenizer(TokenizerBase):
if token in special_tokens:
if regular_tokens:
decoded_list.append(
self.tokenizer.decode(regular_tokens,
self._special_token_policy))
self.tokenizer.decode(
regular_tokens, self._special_token_policy
)
)
regular_tokens = []
decoded_list.append(token)
else:
@@ -462,19 +469,19 @@ class MistralTokenizer(TokenizerBase):
if regular_tokens:
decoded_list.append(
self.tokenizer.decode(regular_tokens,
self._special_token_policy))
self.tokenizer.decode(regular_tokens, self._special_token_policy)
)
decoded = ''.join(decoded_list)
decoded = "".join(decoded_list)
return decoded
def decode(self,
ids: Union[list[int], int],
skip_special_tokens: bool = True) -> str:
assert (
skip_special_tokens
), "skip_special_tokens=False is not supported for Mistral tokenizers."
def decode(
self, ids: Union[list[int], int], skip_special_tokens: bool = True
) -> str:
assert skip_special_tokens, (
"skip_special_tokens=False is not supported for Mistral tokenizers."
)
if isinstance(ids, int):
ids = [ids]
@@ -486,13 +493,12 @@ class MistralTokenizer(TokenizerBase):
skip_special_tokens: bool = True,
) -> list[str]:
from mistral_common.tokens.tokenizers.base import SpecialTokens
from mistral_common.tokens.tokenizers.instruct import (
InstructTokenizerV13)
from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13
# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "skip_special_tokens=False is not supported for Mistral tokenizers."
assert skip_special_tokens, (
"skip_special_tokens=False is not supported for Mistral tokenizers."
)
assert self.is_tekken or self.is_spm, type(self.tokenizer)
@@ -507,8 +513,9 @@ class MistralTokenizer(TokenizerBase):
if self.instruct.END_THINK:
non_skip_special_tokens.add(self.instruct.END_THINK)
ids = [
i for i in ids if i > self.tokenizer.num_special_tokens
or i in non_skip_special_tokens
i
for i in ids
if i > self.tokenizer.num_special_tokens or i in non_skip_special_tokens
]
tokens = [self.tokenizer.id_to_piece(id) for id in ids]

View File

@@ -15,7 +15,7 @@ logger = init_logger(__name__)
def is_s3(model_or_path: str) -> bool:
return model_or_path.lower().startswith('s3://')
return model_or_path.lower().startswith("s3://")
def check_gguf_file(model: Union[str, PathLike]) -> bool:
@@ -43,13 +43,16 @@ def modelscope_list_repo_files(
) -> list[str]:
"""List files in a modelscope repo."""
from modelscope.hub.api import HubApi
api = HubApi()
api.login(token)
# same as huggingface_hub.list_repo_files
files = [
file['Path'] for file in api.get_model_files(
model_id=repo_id, revision=revision, recursive=True)
if file['Type'] == 'blob'
file["Path"]
for file in api.get_model_files(
model_id=repo_id, revision=revision, recursive=True
)
if file["Type"] == "blob"
]
return files
@@ -91,18 +94,18 @@ def maybe_model_redirect(model: str) -> str:
if not Path(model_redirect_path).exists():
return model
redirect_dict = (_maybe_json_dict(model_redirect_path)
or _maybe_space_split_dict(model_redirect_path))
if (redirect_model := redirect_dict.get(model)):
redirect_dict = _maybe_json_dict(model_redirect_path) or _maybe_space_split_dict(
model_redirect_path
)
if redirect_model := redirect_dict.get(model):
logger.info("model redirect: [ %s ] -> [ %s ]", model, redirect_model)
return redirect_model
return model
def parse_safetensors_file_metadata(
path: Union[str, PathLike]) -> dict[str, Any]:
def parse_safetensors_file_metadata(path: Union[str, PathLike]) -> dict[str, Any]:
with open(path, "rb") as f:
length_of_metadata = struct.unpack('<Q', f.read(8))[0]
metadata = json.loads(f.read(length_of_metadata).decode('utf-8'))
length_of_metadata = struct.unpack("<Q", f.read(8))[0]
metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))
return metadata
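parse_safetensors_file_metadata relies on the safetensors on-disk layout: an 8-byte little-endian length prefix followed by a UTF-8 JSON header. A self-contained sketch of that layout, with a fabricated single-tensor header:

import json
import struct

header = {
    "__metadata__": {"format": "pt"},
    "w": {"dtype": "F32", "shape": [2, 2], "data_offsets": [0, 16]},
}
header_bytes = json.dumps(header).encode("utf-8")
blob = struct.pack("<Q", len(header_bytes)) + header_bytes + b"\x00" * 16

# Same parsing steps as above, applied to the in-memory blob.
length = struct.unpack("<Q", blob[:8])[0]
assert json.loads(blob[8:8 + length].decode("utf-8")) == header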