Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
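For reference, the restyling in this diff is mechanical: ruff's formatter replaces yapf's parenthesis-aligned continuation lines with trailing-comma style (or collapses them onto one line when they fit in the line length), normalizes single-quoted strings to double quotes, and import sorting moves from isort to ruff's isort-compatible lint rules. The following is a minimal illustrative sketch of the before/after style, using hypothetical code that is not part of this commit:

# Hypothetical example (not taken from this diff): the two functions below are
# behaviourally identical; only the layout differs.

def _join(*parts: str, sep: str = "/") -> str:
    # Tiny helper so the snippet is self-contained.
    return sep.join(parts)

def path_yapf_style(repo: str, revision: str) -> str:
    # yapf + isort style (before): arguments aligned under the opening
    # parenthesis, single-quoted strings, closing parenthesis glued to the
    # last argument.
    return _join('models',
                 repo,
                 revision,
                 sep='/')

def path_ruff_style(repo: str, revision: str) -> str:
    # ruff format style (after): double quotes, one argument per line with a
    # trailing comma, closing parenthesis on its own line.
    return _join(
        "models",
        repo,
        revision,
        sep="/",
    )

if __name__ == "__main__":
    # Both layouts produce the same result.
    assert path_yapf_style("org/name", "main") == path_ruff_style("org/name", "main")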
@@ -10,10 +10,11 @@ if envs.VLLM_USE_MODELSCOPE:
from packaging import version

# patch_hub begins from modelscope>=1.18.1
if version.parse(modelscope.__version__) <= version.parse('1.18.0'):
if version.parse(modelscope.__version__) <= version.parse("1.18.0"):
raise ImportError(
'Using vLLM with ModelScope needs modelscope>=1.18.1, please '
'install by `pip install modelscope -U`')
"Using vLLM with ModelScope needs modelscope>=1.18.1, please "
"install by `pip install modelscope -U`"
)
from modelscope.utils.hf_util import patch_hub

# Patch hub to download models from modelscope to speed up.

@@ -21,4 +22,5 @@ if envs.VLLM_USE_MODELSCOPE:
except ImportError as err:
raise ImportError(
"Please install modelscope>=1.18.1 via "
"`pip install modelscope>=1.18.1` to use ModelScope.") from err
"`pip install modelscope>=1.18.1` to use ModelScope."
) from err

@@ -12,16 +12,14 @@ CHAT_TEMPLATES_DIR = Path(__file__).parent
ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]


def _get_qwen_chat_template_fallback(
tokenizer_name_or_path: str) -> Optional[Path]:
def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
if tokenizer_name_or_path.endswith("-Chat"):
return CHAT_TEMPLATES_DIR / "template_chatml.jinja"

return CHAT_TEMPLATES_DIR / "template_basic.jinja"


def _get_minicpmv_chat_template_fallback(
tokenizer_name_or_path: str) -> Optional[Path]:
def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
# MiniCPM-V-4.5 version uses a dedicated template
if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path:
return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"

@@ -51,8 +49,10 @@ def register_chat_template_fallback_path(
if model_type in _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK:
logger.warning(
"Model type %s already has a chat template registered. "
"It will be overwritten by the new chat template %s.", model_type,
chat_template)
"It will be overwritten by the new chat template %s.",
model_type,
chat_template,
)

_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK[model_type] = chat_template
@@ -10,26 +10,32 @@ from pathlib import Path
from typing import Any, Callable, Literal, Optional, TypeVar, Union

import huggingface_hub
from huggingface_hub import get_safetensors_metadata, hf_hub_download
from huggingface_hub import (
get_safetensors_metadata,
hf_hub_download,
try_to_load_from_cache,
)
from huggingface_hub import list_repo_files as hf_list_repo_files
from huggingface_hub import try_to_load_from_cache
from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError)
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from transformers import GenerationConfig, PretrainedConfig
from transformers.models.auto.image_processing_auto import (
get_image_processor_config)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from transformers.models.auto.tokenization_auto import get_tokenizer_config
from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME

from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.config_parser_base import ConfigParserBase
from vllm.transformers_utils.utils import (check_gguf_file,
parse_safetensors_file_metadata)
from vllm.transformers_utils.utils import (
check_gguf_file,
parse_safetensors_file_metadata,
)

if envs.VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig

@@ -45,21 +51,21 @@ def _get_hf_token() -> Optional[str]:
"""
Get the HuggingFace token from environment variable.

Returns None if the token is not set, is an empty string,
Returns None if the token is not set, is an empty string,
or contains only whitespace.
This follows the same pattern as huggingface_hub library which
treats empty string tokens as None to avoid authentication errors.
"""
token = os.getenv('HF_TOKEN')
token = os.getenv("HF_TOKEN")
if token and token.strip():
return token
return None


class LazyConfigDict(dict):

def __getitem__(self, key):
import vllm.transformers_utils.configs as configs

return getattr(configs, super().__getitem__(key))
@@ -84,30 +90,28 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
ultravox="UltravoxConfig",
step3_vl="Step3VLConfig",
step3_text="Step3TextConfig",
qwen3_next="Qwen3NextConfig")
qwen3_next="Qwen3NextConfig",
)

_CONFIG_ATTRS_MAPPING: dict[str, str] = {
"llm_config": "text_config",
}

_AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
"internvl_chat": {
"has_no_defaults_at_init": True
},
"NVLM_D": {
"has_no_defaults_at_init": True
},
"internvl_chat": {"has_no_defaults_at_init": True},
"NVLM_D": {"has_no_defaults_at_init": True},
}


class HFConfigParser(ConfigParserBase):

def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
config_dict, _ = PretrainedConfig.get_config_dict(
model,

@@ -119,8 +123,11 @@ class HFConfigParser(ConfigParserBase):
# Use custom model class if it's in our registry
model_type = config_dict.get("model_type")
if model_type is None:
model_type = "speculators" if config_dict.get(
"speculators_config") is not None else model_type
model_type = (
"speculators"
if config_dict.get("speculators_config") is not None
else model_type
)

if model_type in _CONFIG_REGISTRY:
config_class = _CONFIG_REGISTRY[model_type]

@@ -133,8 +140,7 @@ class HFConfigParser(ConfigParserBase):
)
else:
try:
kwargs = _maybe_update_auto_config_kwargs(
kwargs, model_type=model_type)
kwargs = _maybe_update_auto_config_kwargs(kwargs, model_type=model_type)
config = AutoConfig.from_pretrained(
model,
trust_remote_code=trust_remote_code,

@@ -144,15 +150,17 @@ class HFConfigParser(ConfigParserBase):
**kwargs,
)
except ValueError as e:
if (not trust_remote_code
and "requires you to execute the configuration file"
in str(e)):
if (
not trust_remote_code
and "requires you to execute the configuration file" in str(e)
):
err_msg = (
"Failed to load the model config. If the model "
"is a custom model not yet available in the "
"HuggingFace transformers library, consider setting "
"`trust_remote_code=True` in LLM or using the "
"`--trust-remote-code` flag in the CLI.")
"`--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -161,20 +169,23 @@ class HFConfigParser(ConfigParserBase):


class MistralConfigParser(ConfigParserBase):

def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
# This function loads a params.json config which
# should be used when loading models in mistral format
config_dict = _download_mistral_config_file(model, revision)
if (max_position_embeddings :=
config_dict.get("max_position_embeddings")) is None:
if (
max_position_embeddings := config_dict.get("max_position_embeddings")
) is None:
max_position_embeddings = _maybe_retrieve_max_pos_from_hf(
model, revision, **kwargs)
model, revision, **kwargs
)
config_dict["max_position_embeddings"] = max_position_embeddings

from vllm.transformers_utils.configs.mistral import adapt_config_dict

@@ -183,8 +194,9 @@ class MistralConfigParser(ConfigParserBase):

# Mistral configs may define sliding_window as list[int]. Convert it
# to int and add the layer_types list[str] to make it HF compatible
if ((sliding_window := getattr(config, "sliding_window", None))
and isinstance(sliding_window, list)):
if (sliding_window := getattr(config, "sliding_window", None)) and isinstance(
sliding_window, list
):
pattern_repeats = config.num_hidden_layers // len(sliding_window)
layer_types = sliding_window * pattern_repeats
config.layer_types = [

@@ -216,44 +228,51 @@ def get_config_parser(config_format: str) -> ConfigParserBase:


def register_config_parser(config_format: str):

"""Register a customized vllm config parser.
When a config format is not supported by vllm, you can register a customized
config parser to support it.
Args:
config_format (str): The config parser format name.
Examples:
When a config format is not supported by vllm, you can register a customized
config parser to support it.
Args:
config_format (str): The config parser format name.
Examples:

>>> from vllm.transformers_utils.config import (get_config_parser,
register_config_parser)
>>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
>>>
>>> @register_config_parser("custom_config_parser")
... class CustomConfigParser(ConfigParserBase):
... def parse(self,
... model: Union[str, Path],
... trust_remote_code: bool,
... revision: Optional[str] = None,
... code_revision: Optional[str] = None,
... **kwargs) -> tuple[dict, PretrainedConfig]:
... raise NotImplementedError
>>>
>>> type(get_config_parser("custom_config_parser"))
<class 'CustomConfigParser'>
>>> from vllm.transformers_utils.config import (get_config_parser,
register_config_parser)
>>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
>>>
>>> @register_config_parser("custom_config_parser")
... class CustomConfigParser(ConfigParserBase):
... def parse(
... self,
... model: Union[str, Path],
... trust_remote_code: bool,
... revision: Optional[str] = None,
... code_revision: Optional[str] = None,
... **kwargs,
... ) -> tuple[dict, PretrainedConfig]:
... raise NotImplementedError
>>>
>>> type(get_config_parser("custom_config_parser"))
<class 'CustomConfigParser'>
""" # noqa: E501

def _wrapper(config_parser_cls):
if config_format in _CONFIG_FORMAT_TO_CONFIG_PARSER:
logger.warning(
"Config format `%s` is already registered, and will be "
"overwritten by the new parser class `%s`.", config_format,
config_parser_cls)
"overwritten by the new parser class `%s`.",
config_format,
config_parser_cls,
)
if not issubclass(config_parser_cls, ConfigParserBase):
raise ValueError("The config parser must be a subclass of "
"`ConfigParserBase`.")
raise ValueError(
"The config parser must be a subclass of `ConfigParserBase`."
)
_CONFIG_FORMAT_TO_CONFIG_PARSER[config_format] = config_parser_cls
logger.info("Registered config parser `%s` with config format `%s`",
config_parser_cls, config_format)
logger.info(
"Registered config parser `%s` with config format `%s`",
config_parser_cls,
config_format,
)
return config_parser_cls

return _wrapper
@@ -275,8 +294,9 @@ def with_retry(
if attempt == max_retries - 1:
logger.error("%s: %s", log_msg, e)
raise
logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1,
max_retries)
logger.error(
"%s: %s, retrying %d of %d", log_msg, e, attempt + 1, max_retries
)
time.sleep(retry_delay)
retry_delay *= 2

@@ -292,28 +312,27 @@ def list_repo_files(
repo_type: Optional[str] = None,
token: Union[str, bool, None] = None,
) -> list[str]:

def lookup_files() -> list[str]:
# directly list files if model is local
if (local_path := Path(repo_id)).exists():
return [
str(file.relative_to(local_path))
for file in local_path.rglob('*') if file.is_file()
for file in local_path.rglob("*")
if file.is_file()
]
# if model is remote, use hf_hub api to list files
try:
if envs.VLLM_USE_MODELSCOPE:
from vllm.transformers_utils.utils import (
modelscope_list_repo_files)
return modelscope_list_repo_files(repo_id,
revision=revision,
token=os.getenv(
"MODELSCOPE_API_TOKEN",
None))
return hf_list_repo_files(repo_id,
revision=revision,
repo_type=repo_type,
token=token)
from vllm.transformers_utils.utils import modelscope_list_repo_files

return modelscope_list_repo_files(
repo_id,
revision=revision,
token=os.getenv("MODELSCOPE_API_TOKEN", None),
)
return hf_list_repo_files(
repo_id, revision=revision, repo_type=repo_type, token=token
)
except huggingface_hub.errors.OfflineModeIsEnabled:
# Don't raise in offline mode,
# all we know is that we don't have this

@@ -331,23 +350,23 @@ def file_exists(
revision: Optional[str] = None,
token: Union[str, bool, None] = None,
) -> bool:
file_list = list_repo_files(repo_id,
repo_type=repo_type,
revision=revision,
token=token)
file_list = list_repo_files(
repo_id, repo_type=repo_type, revision=revision, token=token
)
return file_name in file_list


# In offline mode the result can be a false negative
def file_or_path_exists(model: Union[str, Path], config_name: str,
revision: Optional[str]) -> bool:
def file_or_path_exists(
model: Union[str, Path], config_name: str, revision: Optional[str]
) -> bool:
if (local_path := Path(model)).exists():
return (local_path / config_name).is_file()

# Offline mode support: Check if config file is cached already
cached_filepath = try_to_load_from_cache(repo_id=model,
filename=config_name,
revision=revision)
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=config_name, revision=revision
)
if isinstance(cached_filepath, str):
# The config file exists in cache- we can continue trying to load
return True

@@ -356,10 +375,9 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
# hf_hub. This will fail in offline mode.

# Call HF to check if the file exists
return file_exists(str(model),
config_name,
revision=revision,
token=_get_hf_token())
return file_exists(
str(model), config_name, revision=revision, token=_get_hf_token()
)


def patch_rope_scaling(config: PretrainedConfig) -> None:

@@ -381,7 +399,8 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
f"field) and 'type={rope_type_legacy}' (legacy field). "
"You should only specify one of them.")
"You should only specify one of them."
)

if "rope_type" not in rope_scaling and "type" in rope_scaling:
rope_scaling["rope_type"] = rope_scaling["type"]
@@ -409,8 +428,11 @@ def _uses_mrope(config: PretrainedConfig) -> bool:

def uses_mrope(config: PretrainedConfig) -> bool:
"""Detect if the model with this config uses M-ROPE."""
return _uses_mrope(config) or _uses_mrope(
config.get_text_config()) or thinker_uses_mrope(config)
return (
_uses_mrope(config)
or _uses_mrope(config.get_text_config())
or thinker_uses_mrope(config)
)


def thinker_uses_mrope(config: PretrainedConfig) -> bool:

@@ -432,8 +454,7 @@ def is_encoder_decoder(config: PretrainedConfig) -> bool:
def _is_encoder_decoder(config: PretrainedConfig) -> bool:
return getattr(config, "is_encoder_decoder", False)

return (_is_encoder_decoder(config)
or _is_encoder_decoder(config.get_text_config()))
return _is_encoder_decoder(config) or _is_encoder_decoder(config.get_text_config())


def is_interleaved(config: PretrainedConfig) -> bool:

@@ -462,8 +483,7 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig:
if hasattr(config, old_attr):
if not hasattr(config, new_attr):
config.update({new_attr: getattr(config, old_attr)})
logger.debug("Remapped config attribute '%s' to '%s'", old_attr,
new_attr)
logger.debug("Remapped config attribute '%s' to '%s'", old_attr, new_attr)
return config


@@ -512,11 +532,11 @@ def maybe_override_with_speculators(
return model, tokenizer, vllm_speculative_config

# Speculators format detected - process overrides
from vllm.transformers_utils.configs.speculators.base import (
SpeculatorsConfig)
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig

speculative_config = SpeculatorsConfig.extract_vllm_speculative_config(
config_dict=config_dict)
config_dict=config_dict
)

# Set the draft model to the speculators model
speculative_config["model"] = model

@@ -535,8 +555,7 @@ def get_config(
code_revision: Optional[str] = None,
config_format: Union[str, ConfigFormat] = "auto",
hf_overrides_kw: Optional[dict[str, Any]] = None,
hf_overrides_fn: Optional[Callable[[PretrainedConfig],
PretrainedConfig]] = None,
hf_overrides_fn: Optional[Callable[[PretrainedConfig], PretrainedConfig]] = None,
**kwargs,
) -> PretrainedConfig:
# Separate model folder from file path for GGUF models

@@ -548,12 +567,9 @@ def get_config(

if config_format == "auto":
try:
if is_gguf or file_or_path_exists(
model, HF_CONFIG_NAME, revision=revision):
if is_gguf or file_or_path_exists(model, HF_CONFIG_NAME, revision=revision):
config_format = "hf"
elif file_or_path_exists(model,
MISTRAL_CONFIG_NAME,
revision=revision):
elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision):
config_format = "mistral"
else:
raise ValueError(

@@ -561,7 +577,8 @@ def get_config(
"With config_format 'auto', ensure your model has either "
"config.json (HF format) or params.json (Mistral format). "
"Otherwise please specify your_custom_config_format "
"in engine args for customized config parser.")
"in engine args for customized config parser."
)

except Exception as e:
error_message = (

@@ -576,7 +593,8 @@ def get_config(
"'params.json'.\n"
"3. For GGUF: pass the local path of the GGUF checkpoint.\n"
" Loading GGUF from a remote repo directly is not yet "
"supported.\n").format(model=model)
"supported.\n"
).format(model=model)

raise ValueError(error_message) from e

@@ -591,8 +609,7 @@ def get_config(
# Special architecture mapping check for GGUF models
if is_gguf:
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
raise RuntimeError(
f"Can't get gguf config for {config.model_type}.")
raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
config.update({"architectures": [model_type]})
@@ -602,29 +619,35 @@ def get_config(

# ModelOpt 0.29.0 and before saves the quantization config in a separate
# "hf_quant_config.json" in the same directory as the model config file.
if quantization_config is None \
and file_or_path_exists(model, "hf_quant_config.json", revision):
quantization_config = get_hf_file_to_dict("hf_quant_config.json",
model, revision)
if quantization_config is None and file_or_path_exists(
model, "hf_quant_config.json", revision
):
quantization_config = get_hf_file_to_dict(
"hf_quant_config.json", model, revision
)

if quantization_config is not None:
config.quantization_config = quantization_config
# auto-enable DeepGEMM UE8M0 on Hopper if model config requests it
scale_fmt = quantization_config.get("scale_fmt", None)
if scale_fmt in ("ue8m0", ):
if scale_fmt in ("ue8m0",):
if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"):
os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1"
logger.info_once(
("Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0."),
(
"Detected quantization_config.scale_fmt=%s; "
"enabling Hopper UE8M0."
),
scale_fmt,
)
elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER:
logger.warning_once(
("Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
"Hopper UE8M0 disabled."),
(
"Model config requests UE8M0 "
"(quantization_config.scale_fmt=%s), but "
"VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; "
"Hopper UE8M0 disabled."
),
scale_fmt,
)

@@ -643,17 +666,17 @@ def get_config(
return config


def try_get_local_file(model: Union[str, Path],
file_name: str,
revision: Optional[str] = 'main') -> Optional[Path]:
def try_get_local_file(
model: Union[str, Path], file_name: str, revision: Optional[str] = "main"
) -> Optional[Path]:
file_path = Path(model) / file_name
if file_path.is_file():
return file_path
else:
try:
cached_filepath = try_to_load_from_cache(repo_id=model,
filename=file_name,
revision=revision)
cached_filepath = try_to_load_from_cache(
repo_id=model, filename=file_name, revision=revision
)
if isinstance(cached_filepath, str):
return Path(cached_filepath)
except ValueError:

@@ -661,9 +684,9 @@ def try_get_local_file(model: Union[str, Path],
return None


def get_hf_file_to_dict(file_name: str,
model: Union[str, Path],
revision: Optional[str] = 'main'):
def get_hf_file_to_dict(
file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
):
"""
Downloads a file from the Hugging Face Hub and returns
its contents as a dictionary.
@@ -678,25 +701,27 @@ def get_hf_file_to_dict(file_name: str,
the contents of the downloaded file.
"""

file_path = try_get_local_file(model=model,
file_name=file_name,
revision=revision)
file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)

if file_path is None:
try:
hf_hub_file = hf_hub_download(model, file_name, revision=revision)
except huggingface_hub.errors.OfflineModeIsEnabled:
return None
except (RepositoryNotFoundError, RevisionNotFoundError,
EntryNotFoundError, LocalEntryNotFoundError) as e:
except (
RepositoryNotFoundError,
RevisionNotFoundError,
EntryNotFoundError,
LocalEntryNotFoundError,
) as e:
logger.debug("File or repository not found in hf_hub_download", e)
return None
except HfHubHTTPError as e:
logger.warning(
"Cannot connect to Hugging Face Hub. Skipping file "
"download for '%s':",
"Cannot connect to Hugging Face Hub. Skipping file download for '%s':",
file_name,
exc_info=e)
exc_info=e,
)
return None
file_path = Path(hf_hub_file)

@@ -708,8 +733,7 @@ def get_hf_file_to_dict(file_name: str,


@cache
def get_pooling_config(model: str,
revision: Optional[str] = 'main') -> Optional[dict]:
def get_pooling_config(model: str, revision: Optional[str] = "main") -> Optional[dict]:
"""
This function gets the pooling and normalize
config from the model - only applies to

@@ -717,20 +741,20 @@ def get_pooling_config(model: str,

Args:
model: The name of the Hugging Face model.
revision: The specific version of the model to use.
revision: The specific version of the model to use.
Defaults to 'main'.

Returns:
A dictionary containing the pooling type and whether
A dictionary containing the pooling type and whether
normalization is used, or None if no pooling configuration is found.
"""

modules_file_name = "modules.json"

modules_dict = None
if file_or_path_exists(model=model,
config_name=modules_file_name,
revision=revision):
if file_or_path_exists(
model=model, config_name=modules_file_name, revision=revision
):
modules_dict = get_hf_file_to_dict(modules_file_name, model, revision)

if modules_dict is None:

@@ -738,20 +762,31 @@ def get_pooling_config(model: str,

logger.info("Found sentence-transformers modules configuration.")

pooling = next((item for item in modules_dict
if item["type"] == "sentence_transformers.models.Pooling"),
None)
pooling = next(
(
item
for item in modules_dict
if item["type"] == "sentence_transformers.models.Pooling"
),
None,
)
normalize = bool(
next((item for item in modules_dict
if item["type"] == "sentence_transformers.models.Normalize"),
False))
next(
(
item
for item in modules_dict
if item["type"] == "sentence_transformers.models.Normalize"
),
False,
)
)

if pooling:

pooling_file_name = "{}/config.json".format(pooling["path"])
pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision)
pooling_type_name = next(
(item for item, val in pooling_dict.items() if val is True), None)
(item for item, val in pooling_dict.items() if val is True), None
)

if pooling_type_name is not None:
pooling_type_name = get_pooling_config_name(pooling_type_name)
@@ -772,20 +807,19 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
if "lasttoken" in pooling_name:
pooling_name = "last"

supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN']
supported_pooling_types = ["LAST", "ALL", "CLS", "STEP", "MEAN"]
pooling_type_name = pooling_name.upper()

if pooling_type_name in supported_pooling_types:
return pooling_type_name

raise NotImplementedError(
f"Pooling type {pooling_type_name} not supported")
raise NotImplementedError(f"Pooling type {pooling_type_name} not supported")


@cache
def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
revision: Optional[str] = 'main'
):
def get_sentence_transformer_tokenizer_config(
model: Union[str, Path], revision: Optional[str] = "main"
):
"""
Returns the tokenization configuration dictionary for a
given Sentence Transformer BERT model.

@@ -812,9 +846,10 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
encoder_dict = None

for config_file in sentence_transformer_config_files:
if try_get_local_file(model=model,
file_name=config_file,
revision=revision) is not None:
if (
try_get_local_file(model=model, file_name=config_file, revision=revision)
is not None
):
encoder_dict = get_hf_file_to_dict(config_file, model, revision)
if encoder_dict:
break

@@ -822,16 +857,15 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
if not encoder_dict and not Path(model).is_absolute():
try:
# If model is on HuggingfaceHub, get the repo files
repo_files = list_repo_files(model,
revision=revision,
token=_get_hf_token())
repo_files = list_repo_files(
model, revision=revision, token=_get_hf_token()
)
except Exception:
repo_files = []

for config_name in sentence_transformer_config_files:
if config_name in repo_files:
encoder_dict = get_hf_file_to_dict(config_name, model,
revision)
encoder_dict = get_hf_file_to_dict(config_name, model, revision)
if encoder_dict:
break
@@ -848,34 +882,39 @@ def get_sentence_transformer_tokenizer_config(model: Union[str, Path],
def maybe_register_config_serialize_by_value() -> None:
"""Try to register HF model configuration class to serialize by value

If trust_remote_code is set, and the model's config file specifies an
`AutoConfig` class, then the config class is typically an instance of
a custom class imported from the HF modules cache.
If trust_remote_code is set, and the model's config file specifies an
`AutoConfig` class, then the config class is typically an instance of
a custom class imported from the HF modules cache.

Examples:
Examples:

>>> from transformers import AutoConfig
>>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
>>> import transformers_modules # error, not initialized
>>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
>>> import transformers_modules # success, initialized
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
>>> from transformers import AutoConfig
>>> klass = AutoConfig.from_pretrained(
... "meta-llama/Meta-Llama-3-8B", trust_remote_code=True
... )
>>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
>>> import transformers_modules # error, not initialized
>>> klass = AutoConfig.from_pretrained(
... "deepseek-ai/DeepSeek-V2.5", trust_remote_code=True
... )
>>> import transformers_modules # success, initialized
>>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config

In the DeepSeek example, the config class is an instance of a custom
class that is not serializable by default. This class will not be
importable in spawned workers, and won't exist at all on
other nodes, which breaks serialization of the config.
In the DeepSeek example, the config class is an instance of a custom
class that is not serializable by default. This class will not be
importable in spawned workers, and won't exist at all on
other nodes, which breaks serialization of the config.

In this function we tell the cloudpickle serialization library to pass
instances of these generated classes by value instead of by reference,
i.e. the class definition is serialized along with its data so that the
class module does not need to be importable on the receiving end.
In this function we tell the cloudpickle serialization library to pass
instances of these generated classes by value instead of by reference,
i.e. the class definition is serialized along with its data so that the
class module does not need to be importable on the receiving end.

See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
""" # noqa
See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
""" # noqa
try:
import transformers_modules

transformers_modules_available = True
except ImportError:
transformers_modules_available = False

@@ -892,7 +931,7 @@ def maybe_register_config_serialize_by_value() -> None:
# serialization of VllmConfig objects that may contain custom configs
# from transformers_modules
def _reduce_config(config: VllmConfig):
return (pickle.loads, (cloudpickle.dumps(config), ))
return (pickle.loads, (cloudpickle.dumps(config),))

multiprocessing.reducer.register(VllmConfig, _reduce_config)

@@ -902,6 +941,7 @@ def maybe_register_config_serialize_by_value() -> None:

# ray vendors its own version of cloudpickle
from vllm.executor.ray_utils import ray

if ray:
ray.cloudpickle.register_pickle_by_value(transformers_modules)

@@ -911,7 +951,8 @@ def maybe_register_config_serialize_by_value() -> None:
" trust_remote_code with by-value serialization. This may"
" lead to a later error. If remote code is not needed"
" remove `--trust-remote-code`",
exc_info=e)
exc_info=e,
)
@@ -926,10 +967,9 @@ def get_hf_image_processor_config(
# Separate model folder from file path for GGUF models
if check_gguf_file(model):
model = Path(model).parent
return get_image_processor_config(model,
token=hf_token,
revision=revision,
**kwargs)
return get_image_processor_config(
model, token=hf_token, revision=revision, **kwargs
)


def get_hf_text_config(config: PretrainedConfig):

@@ -984,8 +1024,9 @@ def try_get_safetensors_metadata(
)

try:
return with_retry(get_safetensors_metadata_partial,
"Error retrieving safetensors")
return with_retry(
get_safetensors_metadata_partial, "Error retrieving safetensors"
)
except Exception:
return None

@@ -1018,9 +1059,9 @@ def get_safetensors_params_metadata(
safetensors_to_check = model_path.glob("*.safetensors")
full_metadata = {
param_name: info
for file_path in safetensors_to_check if file_path.is_file()
for param_name, info in parse_safetensors_file_metadata(
file_path).items()
for file_path in safetensors_to_check
if file_path.is_file()
for param_name, info in parse_safetensors_file_metadata(file_path).items()
}
else:
repo_mt = try_get_safetensors_metadata(model, revision=revision)

@@ -1040,7 +1081,8 @@ def _download_mistral_config_file(model, revision) -> dict:
raise ValueError(
f"Failed to load mistral '{config_file_name}' config for model "
f"{model}. Please check if the model is a mistral-format model "
f"and if the config file exists.")
f"and if the config file exists."
)
assert isinstance(config_dict, dict)
return config_dict

@@ -1049,10 +1091,12 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
max_position_embeddings = 128_000
try:
trust_remote_code_val = kwargs.get("trust_remote_code", False)
hf_config = get_config(model=model,
trust_remote_code=trust_remote_code_val,
revision=revision,
config_format="hf")
hf_config = get_config(
model=model,
trust_remote_code=trust_remote_code_val,
revision=revision,
config_format="hf",
)
if hf_value := hf_config.get_text_config().max_position_embeddings:
max_position_embeddings = hf_value
except Exception as e:

@@ -1060,7 +1104,8 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
"The params.json file is missing 'max_position_embeddings'"
" and could not get a value from the HF config."
" Defaulting to 128000",
exc_info=e)
exc_info=e,
)

return max_position_embeddings

@@ -1076,29 +1121,28 @@ def get_model_path(model: Union[str, Path], revision: Optional[str] = None):

if envs.VLLM_USE_MODELSCOPE:
from modelscope.hub.snapshot_download import snapshot_download

return snapshot_download(model_id=model, **common_kwargs)

from huggingface_hub import snapshot_download

return snapshot_download(repo_id=model, **common_kwargs)


def get_hf_file_bytes(file_name: str,
model: Union[str, Path],
revision: Optional[str] = 'main') -> Optional[bytes]:
def get_hf_file_bytes(
file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
) -> Optional[bytes]:
"""Get file contents from HuggingFace repository as bytes."""
file_path = try_get_local_file(model=model,
file_name=file_name,
revision=revision)
file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)

if file_path is None:
hf_hub_file = hf_hub_download(model,
file_name,
revision=revision,
token=_get_hf_token())
hf_hub_file = hf_hub_download(
model, file_name, revision=revision, token=_get_hf_token()
)
file_path = Path(hf_hub_file)

if file_path is not None and file_path.is_file():
with open(file_path, 'rb') as file:
with open(file_path, "rb") as file:
return file.read()

return None
@@ -9,12 +9,13 @@ from transformers import PretrainedConfig


class ConfigParserBase(ABC):

@abstractmethod
def parse(self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs) -> tuple[dict, PretrainedConfig]:
def parse(
self,
model: Union[str, Path],
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
**kwargs,
) -> tuple[dict, PretrainedConfig]:
raise NotImplementedError

@@ -12,6 +12,7 @@ from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig

# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.

@@ -30,9 +31,11 @@ from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig)
from vllm.transformers_utils.configs.step3_vl import (
Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig,
)
from vllm.transformers_utils.configs.ultravox import UltravoxConfig

__all__ = [
@@ -13,33 +13,35 @@ class ChatGLMConfig(PretrainedConfig):
"n_head_kv": "multi_query_group_num",
}

def __init__(self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs):
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs,
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size

@@ -55,7 +57,8 @@ class ChatGLMConfig(PretrainedConfig):
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = (
apply_residual_connection_post_layernorm)
apply_residual_connection_post_layernorm
)
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias

@@ -7,7 +7,6 @@ logger = logging.get_logger(__name__)


class DeepseekV3Config(PretrainedConfig):

model_type = "deepseek_v3"
keys_to_ignore_at_inference = ["past_key_values"]

@@ -30,14 +29,14 @@ class DeepseekV3Config(PretrainedConfig):
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='noaux_tc',
topk_method="noaux_tc",
n_group=8,
topk_group=4,
num_experts_per_tok=8,
moe_layer_freq=1,
first_k_dense_replace=3,
norm_topk_prob=True,
scoring_func='sigmoid',
scoring_func="sigmoid",
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
@@ -25,20 +25,22 @@ class VisionEncoderConfig(PretrainedConfig):
deterministic: bool = False
num_recomputing_layers: int = 0

def __init__(self,
model_name: str = "vit_so400m_patch14_siglip_384.webli",
image_size: int = 384,
patch_size: int = 16,
width: int = 1024,
layers: int = 24,
heads: int = 16,
mlp_ratio: int = 4,
global_pool: str = "map",
ignore_head: bool = True,
class_token: bool = False,
num_classes: int = 0,
use_checkpoint: bool = False,
**kwargs):
def __init__(
self,
model_name: str = "vit_so400m_patch14_siglip_384.webli",
image_size: int = 384,
patch_size: int = 16,
width: int = 1024,
layers: int = 24,
heads: int = 16,
mlp_ratio: int = 4,
global_pool: str = "map",
ignore_head: bool = True,
class_token: bool = False,
num_classes: int = 0,
use_checkpoint: bool = False,
**kwargs,
):
self.model_name = model_name
self.image_size = image_size
self.patch_size = patch_size

@@ -65,14 +67,16 @@ class MlpProjectorConfig(PretrainedConfig):
downsample_ratio: int = 2
token_pooling: bool = False

def __init__(self,
projector_type: str = "downsample_mlp_gelu",
input_dim: int = 1152,
n_embed: int = 2048,
depth: int = 2,
mlp_ratio: int = 1,
downsample_ratio: int = 2,
**kwargs):
def __init__(
self,
projector_type: str = "downsample_mlp_gelu",
input_dim: int = 1152,
n_embed: int = 2048,
depth: int = 2,
mlp_ratio: int = 1,
downsample_ratio: int = 2,
**kwargs,
):
self.projector_type = projector_type
self.input_dim = input_dim
self.n_embed = n_embed

@@ -84,7 +88,6 @@ class MlpProjectorConfig(PretrainedConfig):


class DeepseekV2Config(PretrainedConfig):

model_type = "deepseek_v2"
keys_to_ignore_at_inference = ["past_key_values"]

@@ -106,14 +109,14 @@ class DeepseekV2Config(PretrainedConfig):
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='gready',
topk_method="gready",
n_group=None,
topk_group=None,
num_experts_per_tok=None,
moe_layer_freq=1,
first_k_dense_replace=0,
norm_topk_prob=False,
scoring_func='softmax',
scoring_func="softmax",
aux_loss_alpha=0.001,
seq_aux=True,
hidden_act="silu",
@@ -191,14 +194,15 @@ class DeepseekVLV2Config(PretrainedConfig):

tile_tag: str = "2D"
global_view_pos: str = "head"
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)

def __init__(self,
tile_tag: str = "tile_tag",
global_view_pos: str = "head",
candidate_resolutions: tuple[tuple[int,
int]] = ((384, 384), ),
**kwargs):
def __init__(
self,
tile_tag: str = "tile_tag",
global_view_pos: str = "head",
candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
**kwargs,
):
super().__init__(**kwargs)

vision_config = kwargs.get("vision_config", {})

@@ -53,12 +53,14 @@ class DotsVisionConfig(PretrainedConfig):
class DotsOCRConfig(Qwen2Config):
model_type = "dots_ocr"

def __init__(self,
image_token_id=151665,
video_token_id=151656,
vision_config: Optional[dict] = None,
*args,
**kwargs):
def __init__(
self,
image_token_id=151665,
video_token_id=151656,
vision_config: Optional[dict] = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.image_token_id = image_token_id
self.video_token_id = video_token_id

@@ -12,12 +12,13 @@ from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
class EAGLEConfig(PretrainedConfig):
model_type = "eagle"

def __init__(self,
model: Union[PretrainedConfig, dict, None] = None,
truncated_vocab_size: Optional[int] = None,
method: Optional[str] = 'eagle',
**kwargs):

def __init__(
self,
model: Union[PretrainedConfig, dict, None] = None,
truncated_vocab_size: Optional[int] = None,
method: Optional[str] = "eagle",
**kwargs,
):
model_config: Union[PretrainedConfig, DeepseekV2Config, None]
if isinstance(model, dict):
archs = model.get("architectures", [])

@@ -31,8 +32,7 @@ class EAGLEConfig(PretrainedConfig):
model_config = model

for k, v in kwargs.items():
if k != "architectures" and k != "model_type" and hasattr(
model_config, k):
if k != "architectures" and k != "model_type" and hasattr(model_config, k):
setattr(model_config, k, v)

self.model = model_config

@@ -40,31 +40,39 @@ class EAGLEConfig(PretrainedConfig):
if self.model is None:
self.truncated_vocab_size = None
else:
self.truncated_vocab_size = self.model.vocab_size if \
truncated_vocab_size is None else truncated_vocab_size
self.truncated_vocab_size = (
self.model.vocab_size
if truncated_vocab_size is None
else truncated_vocab_size
)

# Eagle model name should follow naming convention of
# LlamaForCausalLM -> EagleLlamaForCausalLM
# LlamaForCausalLM -> Eagle3LlamaForCausalLM
# LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
if method == "eagle":
assert self.model is not None, \
assert self.model is not None, (
"model should not be None when method is eagle"
)
kwargs["architectures"] = [
f"Eagle{arch}" if not arch.startswith("Eagle") \
else arch for arch in self.model.architectures
f"Eagle{arch}" if not arch.startswith("Eagle") else arch
for arch in self.model.architectures
]

elif method == "eagle3":
assert self.model is not None, \
assert self.model is not None, (
"model should not be None when method is eagle3"
)
kwargs["architectures"] = [
arch if arch.startswith("Eagle3") or arch.endswith("Eagle3")
else f"Eagle3{arch}" for arch in self.model.architectures
arch
if arch.startswith("Eagle3") or arch.endswith("Eagle3")
else f"Eagle3{arch}"
for arch in self.model.architectures
]
else:
raise ValueError(f"Invalid method {method}. "
"Supported methods are eagle and eagle3.")
raise ValueError(
f"Invalid method {method}. Supported methods are eagle and eagle3."
)

super().__init__(**kwargs)

@@ -80,5 +88,6 @@ class EAGLEConfig(PretrainedConfig):
**kwargs,
) -> "EAGLEConfig":
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs)
pretrained_model_name_or_path, **kwargs
)
return cls.from_dict(config_dict, **kwargs)
@@ -19,6 +19,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""

from transformers.configuration_utils import PretrainedConfig


@@ -77,9 +78,7 @@ class RWConfig(PretrainedConfig):
# Hack for falcon-40b
self.new_decoder_architecture = True

super().__init__(bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs)
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

@property
def head_dim(self):

@@ -75,7 +75,7 @@ class JAISConfig(PretrainedConfig):
Whether or not the model should return the last key/values
attentions (not used by all models).
scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
Whether to additionally scale attention weights
Whether to additionally scale attention weights
by `1 / layer_idx + 1`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
Whether to scale keys (K) prior to computing attention

@@ -209,29 +209,35 @@ class JAISConfig(PretrainedConfig):
if self.alibi_scaling is None:
return

if (not isinstance(self.alibi_scaling, dict)
or len(self.alibi_scaling) != 2):
if not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2:
raise ValueError(
"`alibi_scaling` must be a dictionary with two fields, "
"`type` and `factor` or `type` and `train_seq_len`, "
f"got {self.alibi_scaling}")
f"got {self.alibi_scaling}"
)
alibi_scaling_type = self.alibi_scaling.get("type", None)
alibi_scaling_factor = self.alibi_scaling.get("factor", None)
alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
if alibi_scaling_type is None or alibi_scaling_type != "linear":
raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None
and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s type field must be 'linear', "
f"got {alibi_scaling_type}"
)
if (
alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)
):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0, "
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None
and alibi_dynamic_scaling <= 1)):
f"got {alibi_scaling_factor}"
)
if (
alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)
):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an "
f"integer > 1, got {alibi_dynamic_scaling}")
f"integer > 1, got {alibi_dynamic_scaling}"
)
@@ -12,13 +12,15 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig
|
||||
class KimiVLConfig(PretrainedConfig):
|
||||
model_type = "kimi_vl"
|
||||
|
||||
def __init__(self,
|
||||
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
|
||||
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
|
||||
ignore_index: int = -100,
|
||||
media_placeholder_token_id: int = 163605,
|
||||
pad_token_id: int = 0,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
|
||||
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
|
||||
ignore_index: int = -100,
|
||||
media_placeholder_token_id: int = 163605,
|
||||
pad_token_id: int = 0,
|
||||
**kwargs,
|
||||
):
|
||||
if vision_config is None:
|
||||
vision_config = MoonViTConfig()
|
||||
elif isinstance(vision_config, dict):
|
||||
|
||||
@@ -10,16 +10,17 @@ from transformers import PretrainedConfig
|
||||
class MedusaConfig(PretrainedConfig):
|
||||
model_type = "medusa"
|
||||
|
||||
def __init__(self,
|
||||
hidden_size: int = 4096,
|
||||
vocab_size: int = 32001,
|
||||
num_heads: int = 5,
|
||||
num_hidden_layers: int = 1,
|
||||
max_paths: int = 64,
|
||||
topk: int = 10,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
**kwargs):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int = 4096,
|
||||
vocab_size: int = 32001,
|
||||
num_heads: int = 5,
|
||||
num_hidden_layers: int = 1,
|
||||
max_paths: int = 64,
|
||||
topk: int = 10,
|
||||
truncated_vocab_size: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.hidden_size = hidden_size
|
||||
self.vocab_size = vocab_size
|
||||
self.num_heads = num_heads
|
||||
@@ -27,8 +28,9 @@ class MedusaConfig(PretrainedConfig):
|
||||
self.max_paths = max_paths
|
||||
self.topk = topk
|
||||
self.max_seq_len = int(2**20)
|
||||
self.truncated_vocab_size = vocab_size if truncated_vocab_size is None\
|
||||
else truncated_vocab_size
|
||||
self.truncated_vocab_size = (
|
||||
vocab_size if truncated_vocab_size is None else truncated_vocab_size
|
||||
)
|
||||
if "architectures" not in kwargs:
|
||||
kwargs["architectures"] = ["MedusaModel"]
|
||||
|
||||
@@ -41,12 +43,13 @@ class MedusaConfig(PretrainedConfig):
|
||||
**kwargs,
|
||||
) -> "MedusaConfig":
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
pretrained_model_name_or_path, **kwargs
|
||||
)
|
||||
for k in list(config_dict.keys()):
|
||||
if 'num' in k:
|
||||
if 'heads' in k:
|
||||
if "num" in k:
|
||||
if "heads" in k:
|
||||
config_dict["num_heads"] = config_dict.pop(k)
|
||||
elif 'layers' in k:
|
||||
elif "layers" in k:
|
||||
config_dict["num_hidden_layers"] = config_dict.pop(k)
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
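The loop above renames whatever "num...heads" / "num...layers" keys a Medusa checkpoint ships onto the names MedusaConfig expects; a quick sketch with assumed input keys (the key names are illustrative, not taken from a specific checkpoint):

# Any key containing "num" is remapped: "...heads" -> num_heads,
# "...layers" -> num_hidden_layers. Other keys are left untouched.
config_dict = {"medusa_num_heads": 5, "medusa_num_layers": 1, "hidden_size": 4096}

for k in list(config_dict.keys()):
    if "num" in k:
        if "heads" in k:
            config_dict["num_heads"] = config_dict.pop(k)
        elif "layers" in k:
            config_dict["num_hidden_layers"] = config_dict.pop(k)

print(config_dict)
# {'hidden_size': 4096, 'num_heads': 5, 'num_hidden_layers': 1}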
@@ -25,7 +25,8 @@ from typing import Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
|
||||
Qwen2_5OmniTextConfig)
|
||||
Qwen2_5OmniTextConfig,
|
||||
)
|
||||
|
||||
|
||||
class DashengConfig(PretrainedConfig):
|
||||
@@ -91,11 +92,13 @@ class MiDashengLMConfig(PretrainedConfig):
|
||||
audio_token_id: Optional[int] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.audio_encoder_config = DashengConfig(
|
||||
**(audio_encoder_config or {}))
|
||||
self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
|
||||
self.subsample_factor = subsample_factor
|
||||
self.text_config = (Qwen2_5OmniTextConfig(
|
||||
**text_config) if text_config else Qwen2_5OmniTextConfig())
|
||||
self.text_config = (
|
||||
Qwen2_5OmniTextConfig(**text_config)
|
||||
if text_config
|
||||
else Qwen2_5OmniTextConfig()
|
||||
)
|
||||
self.text_config.rope_scaling = None # uses_mrope is false
|
||||
self.audio_token_id = audio_token_id
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -9,8 +9,7 @@ from vllm.logger import init_logger
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def adapt_config_dict(config_dict: dict[str, Any],
|
||||
**kwargs) -> PretrainedConfig:
|
||||
def adapt_config_dict(config_dict: dict[str, Any], **kwargs) -> PretrainedConfig:
|
||||
config_dict.update(kwargs)
|
||||
config_dict = _remap_general_mistral_args(config_dict)
|
||||
|
||||
@@ -25,15 +24,16 @@ def adapt_config_dict(config_dict: dict[str, Any],
|
||||
if bool(config_dict.get("yarn")):
|
||||
config_dict = _remap_mistral_yarn_args(config_dict)
|
||||
|
||||
is_vision = ((config_dict.get("multimodal")
|
||||
or {}).get("vision_encoder_args")
|
||||
or config_dict.get("vision_encoder"))
|
||||
is_vision = (config_dict.get("multimodal") or {}).get(
|
||||
"vision_encoder_args"
|
||||
) or config_dict.get("vision_encoder")
|
||||
is_audio = bool(
|
||||
((config_dict.get("multimodal") or {}).get("whisper_model_args")
|
||||
or {}).get("encoder_args"))
|
||||
((config_dict.get("multimodal") or {}).get("whisper_model_args") or {}).get(
|
||||
"encoder_args"
|
||||
)
|
||||
)
|
||||
|
||||
assert not (is_vision and is_audio), \
|
||||
"Vision and audio are mutually exclusive"
|
||||
assert not (is_vision and is_audio), "Vision and audio are mutually exclusive"
|
||||
|
||||
if is_vision:
|
||||
config_dict = _remap_mistral_vision_args(config_dict)
|
||||
@@ -77,7 +77,7 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
|
||||
config["rope_scaling"] = {
|
||||
"rope_type": "yarn",
|
||||
"mscale_all_dim": 1, # We hardcoded this to 1
|
||||
**renamed_yarn_config
|
||||
**renamed_yarn_config,
|
||||
}
|
||||
return config
|
||||
|
||||
@@ -105,8 +105,7 @@ def _remap_general_mistral_args(config: dict) -> dict:
|
||||
if key in config:
|
||||
config[new_key] = config.pop(key)
|
||||
|
||||
for new_key, (key,
|
||||
default_value) in top_level_mapping_with_default.items():
|
||||
for new_key, (key, default_value) in top_level_mapping_with_default.items():
|
||||
config[new_key] = config.pop(key, default_value)
|
||||
|
||||
return config
|
||||
@@ -116,16 +115,12 @@ def _remap_mistral_quantization_args(config: dict) -> dict:
|
||||
quantization = config.get("quantization", {})
|
||||
if quantization.get("qformat_weight") == "fp8_e4m3":
|
||||
# This maps to the FP8 static per-tensor quantization scheme
|
||||
quantization_config = {
|
||||
"quant_method": "fp8",
|
||||
"activation_scheme": "static"
|
||||
}
|
||||
quantization_config = {"quant_method": "fp8", "activation_scheme": "static"}
|
||||
elif quantization.get("quant_method") == "compressed-tensors":
|
||||
# Pass through the quantization config to compressed-tensors
|
||||
quantization_config = quantization
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Found unknown quantization='{quantization}' in config")
|
||||
raise ValueError(f"Found unknown quantization='{quantization}' in config")
|
||||
|
||||
config["quantization_config"] = quantization_config
|
||||
|
||||
@@ -139,13 +134,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
|
||||
quant_config = config.get("quantization_config")
|
||||
config = {
|
||||
"model_type":
|
||||
"whixtral",
|
||||
"model_type": "whixtral",
|
||||
"architectures": ["VoxtralForConditionalGeneration"],
|
||||
"text_config":
|
||||
PretrainedConfig.from_dict(config),
|
||||
"audio_config":
|
||||
WhisperConfig(
|
||||
"text_config": PretrainedConfig.from_dict(config),
|
||||
"audio_config": WhisperConfig(
|
||||
num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
|
||||
window_size=encoder_args["audio_encoding_args"]["window_size"],
|
||||
sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
|
||||
@@ -158,7 +150,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
vocab_size=encoder_args["vocab_size"],
|
||||
max_source_positions=encoder_args["max_source_positions"],
|
||||
is_encoder_decoder=False, # Override WhisperConfig default
|
||||
)
|
||||
),
|
||||
}
|
||||
if quant_config:
|
||||
config["quantization_config"] = quant_config
|
||||
|
||||
@@ -13,16 +13,18 @@ class MLPSpeculatorConfig(PretrainedConfig):
|
||||
"hidden_size": "emb_dim",
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 32000,
|
||||
emb_dim: int = 4096,
|
||||
inner_dim: int = 0,
|
||||
n_predict: int = 3,
|
||||
top_k_tokens_per_head: Optional[list[int]] = None,
|
||||
n_candidates: int = 5,
|
||||
tie_weights: bool = False,
|
||||
scale_input: bool = False,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 32000,
|
||||
emb_dim: int = 4096,
|
||||
inner_dim: int = 0,
|
||||
n_predict: int = 3,
|
||||
top_k_tokens_per_head: Optional[list[int]] = None,
|
||||
n_candidates: int = 5,
|
||||
tie_weights: bool = False,
|
||||
scale_input: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize an MLPSpeculatorConfig
|
||||
|
||||
|
||||
@@ -8,16 +8,16 @@ class MoonViTConfig(PretrainedConfig):
|
||||
model_type = "moonvit"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
init_pos_emb_height: int = 64,
|
||||
init_pos_emb_width: int = 64,
|
||||
num_attention_heads: int = 16,
|
||||
num_hidden_layers: int = 27,
|
||||
hidden_size: int = 1152,
|
||||
intermediate_size: int = 4304,
|
||||
merge_kernel_size: tuple[int, int] = (2, 2),
|
||||
**kwargs,
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
init_pos_emb_height: int = 64,
|
||||
init_pos_emb_width: int = 64,
|
||||
num_attention_heads: int = 16,
|
||||
num_hidden_layers: int = 27,
|
||||
hidden_size: int = 1152,
|
||||
intermediate_size: int = 4304,
|
||||
merge_kernel_size: tuple[int, int] = (2, 2),
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self.patch_size = patch_size
|
||||
|
||||
@@ -62,7 +62,7 @@ class NemotronConfig(PretrainedConfig):
|
||||
(MQA) otherwise GQA is used. When converting a multi-head
|
||||
checkpoint to a GQA checkpoint, each group key and value
|
||||
head should be constructed by meanpooling all the original
|
||||
heads within that group. For more details checkout
|
||||
heads within that group. For more details checkout
|
||||
[this paper](https://arxiv.org/pdf/2305.13245.pdf). If it
|
||||
is not specified, will default to `num_attention_heads`.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||
@@ -147,8 +147,9 @@ class NemotronConfig(PretrainedConfig):
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
head_dim = head_dim or kwargs.get("kv_channels")
|
||||
self.head_dim = head_dim if head_dim is not None else (
|
||||
hidden_size // num_attention_heads)
|
||||
self.head_dim = (
|
||||
head_dim if head_dim is not None else (hidden_size // num_attention_heads)
|
||||
)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
@@ -162,8 +163,11 @@ class NemotronConfig(PretrainedConfig):
|
||||
self.rope_theta = rope_theta
|
||||
self.rope_scaling = rope_scaling
|
||||
# for backward compatibility
|
||||
partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get(
|
||||
"rope_percentage") or partial_rotary_factor
|
||||
partial_rotary_factor = (
|
||||
kwargs.get("rope_percent")
|
||||
or kwargs.get("rope_percentage")
|
||||
or partial_rotary_factor
|
||||
)
|
||||
self.partial_rotary_factor = partial_rotary_factor
|
||||
self._rope_scaling_validation()
|
||||
self.attention_bias = attention_bias
|
||||
@@ -185,21 +189,24 @@ class NemotronConfig(PretrainedConfig):
|
||||
if self.rope_scaling is None:
|
||||
return
|
||||
|
||||
if not isinstance(self.rope_scaling, dict) or len(
|
||||
self.rope_scaling) != 2:
|
||||
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
|
||||
raise ValueError(
|
||||
"`rope_scaling` must be a dictionary with two fields, "
|
||||
f"`type` and `factor`, got {self.rope_scaling}")
|
||||
f"`type` and `factor`, got {self.rope_scaling}"
|
||||
)
|
||||
rope_scaling_type = self.rope_scaling.get("type", None)
|
||||
rope_scaling_factor = self.rope_scaling.get("factor", None)
|
||||
if rope_scaling_type is None or rope_scaling_type not in [
|
||||
"linear", "dynamic"
|
||||
]:
|
||||
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s type field must be one of ['linear', "
|
||||
f"'dynamic'], got {rope_scaling_type}")
|
||||
if rope_scaling_factor is None or not isinstance(
|
||||
rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
|
||||
f"'dynamic'], got {rope_scaling_type}"
|
||||
)
|
||||
if (
|
||||
rope_scaling_factor is None
|
||||
or not isinstance(rope_scaling_factor, float)
|
||||
or rope_scaling_factor <= 1.0
|
||||
):
|
||||
raise ValueError(
|
||||
"`rope_scaling`'s factor field must be a float > 1, got "
|
||||
f"{rope_scaling_factor}")
|
||||
f"{rope_scaling_factor}"
|
||||
)
|
||||
|
||||
@@ -203,11 +203,11 @@ class NemotronHConfig(PretrainedConfig):
|
||||
# Validate hybrid_override_pattern
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
assert len(self.hybrid_override_pattern) == self.num_hidden_layers, (
|
||||
"hybrid_override_pattern must have same length as "
|
||||
"num_hidden_layers")
|
||||
"hybrid_override_pattern must have same length as num_hidden_layers"
|
||||
)
|
||||
assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), (
|
||||
"hybrid_override_pattern must only contain characters "
|
||||
"'M', '*', or '-'")
|
||||
"hybrid_override_pattern must only contain characters 'M', '*', or '-'"
|
||||
)
|
||||
|
||||
# for backward compatibility
|
||||
if num_key_value_heads is None:
|
||||
@@ -253,7 +253,10 @@ class NemotronHConfig(PretrainedConfig):
|
||||
@property
|
||||
def layers_block_type(self):
|
||||
return [
|
||||
"mamba" if self.hybrid_override_pattern[i] == "M" else
|
||||
"attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
|
||||
"mamba"
|
||||
if self.hybrid_override_pattern[i] == "M"
|
||||
else "attention"
|
||||
if self.hybrid_override_pattern[i] == "*"
|
||||
else "mlp"
|
||||
for i in range(self.num_hidden_layers)
|
||||
]
|
||||
|
||||
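The nested conditional above expands the override pattern character by character; with a made-up six-layer pattern it behaves like this:

# 'M' -> Mamba2 block, '*' -> attention block, '-' -> MLP block.
hybrid_override_pattern = "M*-M*-"  # example value, not from a real checkpoint
num_hidden_layers = len(hybrid_override_pattern)

layers_block_type = [
    "mamba"
    if hybrid_override_pattern[i] == "M"
    else "attention"
    if hybrid_override_pattern[i] == "*"
    else "mlp"
    for i in range(num_hidden_layers)
]
print(layers_block_type)
# ['mamba', 'attention', 'mlp', 'mamba', 'attention', 'mlp']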
@@ -5,7 +5,6 @@ from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class Olmo3Config(PretrainedConfig):
|
||||
|
||||
model_type = "olmo3"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
|
||||
@@ -16,8 +16,7 @@
|
||||
# limitations under the License.
|
||||
"""Qwen3-Next model configuration"""
|
||||
|
||||
from transformers.configuration_utils import (PretrainedConfig,
|
||||
layer_type_validation)
|
||||
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
|
||||
from transformers.modeling_rope_utils import rope_config_validation
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
@@ -81,11 +81,11 @@ class RadioConfig(PretrainedConfig):
|
||||
self.initializer_factor = initializer_factor
|
||||
self.hidden_act = hidden_act
|
||||
self.max_img_size = max_img_size
|
||||
self.norm_mean = list(norm_mean) if isinstance(norm_mean,
|
||||
(tuple,
|
||||
list)) else norm_mean
|
||||
self.norm_std = list(norm_std) if isinstance(norm_std,
|
||||
(tuple,
|
||||
list)) else norm_std
|
||||
self.norm_mean = (
|
||||
list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
|
||||
)
|
||||
self.norm_std = (
|
||||
list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
|
||||
)
|
||||
self.reg_tokens = reg_tokens
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -5,7 +5,6 @@ SUPPORTED_SPECULATORS_TYPES = {}
|
||||
|
||||
|
||||
def register_speculator(name):
|
||||
|
||||
def decorator(fn):
|
||||
SUPPORTED_SPECULATORS_TYPES[name] = fn
|
||||
return fn
|
||||
@@ -17,7 +16,7 @@ def register_speculator(name):
|
||||
def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
|
||||
"""
|
||||
Apply Eagle-3 specific configuration transformations.
|
||||
|
||||
|
||||
Eagle-3 specific fields:
|
||||
- draft_vocab_size: Size of the draft model's vocabulary
|
||||
- target_hidden_size: Hidden size of the target model
|
||||
@@ -27,6 +26,5 @@ def update_eagle3(config_dict: dict, vllm_config: dict) -> None:
|
||||
vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size")
|
||||
if config_dict.get("target_hidden_size") is not None:
|
||||
vllm_config["target_hidden_size"] = config_dict["target_hidden_size"]
|
||||
vllm_config["norm_before_residual"] = config_dict.get(
|
||||
"norm_before_residual", True)
|
||||
vllm_config["norm_before_residual"] = config_dict.get("norm_before_residual", True)
|
||||
vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"]
|
||||
|
||||
@@ -6,7 +6,8 @@ from typing import Any, Union
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.transformers_utils.configs.speculators.algos import (
|
||||
SUPPORTED_SPECULATORS_TYPES)
|
||||
SUPPORTED_SPECULATORS_TYPES,
|
||||
)
|
||||
|
||||
__all__ = ["SpeculatorsConfig"]
|
||||
|
||||
@@ -21,27 +22,27 @@ class SpeculatorsConfig(PretrainedConfig):
|
||||
**kwargs,
|
||||
) -> "SpeculatorsConfig":
|
||||
"""Load speculators Eagle config and convert to vLLM format."""
|
||||
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path,
|
||||
**kwargs)
|
||||
config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
vllm_config = cls.extract_vllm_speculative_config(config_dict)
|
||||
return cls(**vllm_config)
|
||||
|
||||
@classmethod
|
||||
def extract_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
|
||||
cls, config_dict: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
speculators_model_type = config_dict.get("speculators_model_type")
|
||||
if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES:
|
||||
raise ValueError(
|
||||
f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. "
|
||||
"Please ensure you're loading a speculators-format model.")
|
||||
"Please ensure you're loading a speculators-format model."
|
||||
)
|
||||
|
||||
# validate fields
|
||||
# TODO: @dsikka - use speculators pydantic model to validate
|
||||
cls.validate_speculators_config(config_dict=config_dict)
|
||||
# Convert from speculators config -> format that can be ingested by vLLM
|
||||
vllm_config = cls.build_vllm_speculative_config(
|
||||
config_dict=config_dict)
|
||||
vllm_config = cls.build_vllm_speculative_config(config_dict=config_dict)
|
||||
# Apply anything specific to the supported algorithm
|
||||
algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type]
|
||||
algo_updater(config_dict=config_dict, vllm_config=vllm_config)
|
||||
@@ -64,11 +65,13 @@ class SpeculatorsConfig(PretrainedConfig):
|
||||
|
||||
if not isinstance(config_dict["transformer_layer_config"], dict):
|
||||
raise TypeError(
|
||||
"'transformer_layer_config' must be a dictionary if provided")
|
||||
"'transformer_layer_config' must be a dictionary if provided"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def build_vllm_speculative_config(
|
||||
cls, config_dict: dict[str, Any]) -> dict[str, Any]:
|
||||
cls, config_dict: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Build vLLM-compatible speculative configuration from speculators format.
|
||||
|
||||
@@ -94,14 +97,14 @@ class SpeculatorsConfig(PretrainedConfig):
|
||||
|
||||
if num_speculative_tokens is None:
|
||||
raise ValueError(
|
||||
"Missing 'speculative_tokens' in proposal method. "
|
||||
f"Got: {first_method}")
|
||||
f"Missing 'speculative_tokens' in proposal method. Got: {first_method}"
|
||||
)
|
||||
|
||||
# Build base vLLM speculative configuration
|
||||
vllm_config = {
|
||||
"method": config_dict.get("speculators_model_type"),
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
"target_model": spec_config.get("verifier")["name_or_path"]
|
||||
"target_model": spec_config.get("verifier")["name_or_path"],
|
||||
}
|
||||
|
||||
# Merge transformer layer configuration if present
|
||||
|
||||
@@ -59,13 +59,64 @@ class Step3TextConfig(PretrainedConfig):
share_q_dim: int = 2048,
head_dim: int = 256,
norm_expert_weight: bool = False,
moe_layers_enum: tuple[int,
...] = (4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59),
moe_layers_enum: tuple[int, ...] = (
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
44,
45,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
),
**kwargs,
) -> None:
self.hidden_size = hidden_size

@@ -42,6 +42,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
||||
projector or at the end. Versions v0.4.1 and below
|
||||
use `False`, but v0.5 and above use `True`.
|
||||
"""
|
||||
|
||||
wrapped_model_config: transformers.PretrainedConfig
|
||||
model_type = "ultravox"
|
||||
audio_token = "<|audio|>"
|
||||
@@ -76,15 +77,17 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
||||
if text_model_id is None:
|
||||
text_config = text_config or {}
|
||||
self.wrapped_model_config = transformers.CONFIG_MAPPING[
|
||||
text_config.get("model_type", "llama")](**text_config)
|
||||
text_config.get("model_type", "llama")
|
||||
](**text_config)
|
||||
|
||||
# N.B. May set the audio_config below.
|
||||
self.audio_model_id = audio_model_id
|
||||
if audio_model_id is None:
|
||||
self.audio_model_id = None
|
||||
audio_config = audio_config or {}
|
||||
self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
|
||||
"model_type", "whisper")](**audio_config)
|
||||
self.audio_config = transformers.CONFIG_MAPPING[
|
||||
audio_config.get("model_type", "whisper")
|
||||
](**audio_config)
|
||||
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -99,8 +102,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
|
||||
if key == "text_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
self.wrapped_model_config = get_config(value,
|
||||
trust_remote_code=False)
|
||||
self.wrapped_model_config = get_config(value, trust_remote_code=False)
|
||||
elif key == "audio_model_id" and value is not None:
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
|
||||
@@ -30,8 +30,9 @@ def _convert_tokens_to_string_with_added_encoders(
|
||||
current_sub_text: list[str] = []
|
||||
convert_tokens_to_string = tokenizer.convert_tokens_to_string
|
||||
added_vocab_set = set(tokenizer.get_added_vocab())
|
||||
all_special_tokens = set(
|
||||
tokenizer.all_special_tokens) if skip_special_tokens else ()
|
||||
all_special_tokens = (
|
||||
set(tokenizer.all_special_tokens) if skip_special_tokens else ()
|
||||
)
|
||||
|
||||
for token in output_tokens:
|
||||
# Use precomputed set for skip-special check
|
||||
@@ -70,11 +71,11 @@ def convert_prompt_ids_to_tokens(
|
||||
# We do not need to convert the whole prompt to tokens.
|
||||
# Offset a little more in case we have special tokens.
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2 :],
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
)
|
||||
read_offset = len(new_tokens)
|
||||
prefix_offset = max(
|
||||
read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
|
||||
prefix_offset = max(read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
|
||||
# This is required to guard against out-of-vocab prompt token ids
|
||||
_replace_none_with_empty(new_tokens) # type: ignore[arg-type]
|
||||
return new_tokens, prefix_offset, read_offset
|
||||
@@ -92,7 +93,7 @@ def convert_ids_list_to_tokens(
|
||||
|
||||
Returns:
|
||||
Python list of token string representations
|
||||
|
||||
|
||||
"""
|
||||
token_str_lst = []
|
||||
for token_id in token_ids:
|
||||
@@ -144,18 +145,17 @@ def detokenize_incrementally(
|
||||
# This is the first iteration for this sequence
|
||||
is_first_iter = prev_tokens is None
|
||||
if is_first_iter:
|
||||
(prev_tokens, prefix_offset,
|
||||
read_offset) = convert_prompt_ids_to_tokens(
|
||||
tokenizer,
|
||||
all_input_ids[:-1],
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
(prev_tokens, prefix_offset, read_offset) = convert_prompt_ids_to_tokens(
|
||||
tokenizer, all_input_ids[:-1], skip_special_tokens=skip_special_tokens
|
||||
)
|
||||
assert prev_tokens is not None
|
||||
|
||||
# If the new token id is out of bounds, return an empty string.
|
||||
if 0 <= new_token_id < len(tokenizer):
|
||||
# Put new_token_id in a list so skip_special_tokens is respected
|
||||
new_tokens = tokenizer.convert_ids_to_tokens(
|
||||
[new_token_id], skip_special_tokens=skip_special_tokens)
|
||||
[new_token_id], skip_special_tokens=skip_special_tokens
|
||||
)
|
||||
if isinstance(new_tokens, str):
|
||||
new_tokens = [new_tokens]
|
||||
else:
|
||||
@@ -171,9 +171,9 @@ def detokenize_incrementally(
|
||||
# surrounding ids.
|
||||
if tokenizer.is_fast or not tokenizer.get_added_vocab():
|
||||
prefix_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:read_offset])
|
||||
new_text = tokenizer.convert_tokens_to_string(
|
||||
output_tokens[prefix_offset:])
|
||||
output_tokens[prefix_offset:read_offset]
|
||||
)
|
||||
new_text = tokenizer.convert_tokens_to_string(output_tokens[prefix_offset:])
|
||||
else:
|
||||
prefix_text = _convert_tokens_to_string_with_added_encoders(
|
||||
tokenizer,
|
||||
@@ -195,5 +195,5 @@ def detokenize_incrementally(
|
||||
# by the model
|
||||
return new_tokens, "", prefix_offset, read_offset
|
||||
|
||||
new_text = new_text[len(prefix_text):]
|
||||
new_text = new_text[len(prefix_text) :]
|
||||
return new_tokens, new_text, read_offset, len(output_tokens)
|
||||
|
||||
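The slicing above is the heart of incremental detokenization: decode the token window that ends at the previous read offset, decode the window that also covers the new token, and emit only the text that extends past the old prefix. A toy sketch of the idea (the join-based helper below is a stand-in, not a real HF tokenizer):

def convert_tokens_to_string(tokens: list[str]) -> str:
    # Stand-in for tokenizer.convert_tokens_to_string; real tokenizers merge
    # pieces and handle byte-level tokens, which is why the window is needed.
    return "".join(tokens).replace("▁", " ")

output_tokens = ["▁Hello", ",", "▁wor", "ld", "!"]
prefix_offset, read_offset = 0, 4  # text up to "ld" has already been emitted

prefix_text = convert_tokens_to_string(output_tokens[prefix_offset:read_offset])
new_text = convert_tokens_to_string(output_tokens[prefix_offset:])

# Only emit the suffix once it is longer than the prefix and decodes cleanly.
if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
    print(new_text[len(prefix_text):])  # prints "!"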
@@ -4,8 +4,12 @@
from functools import lru_cache
from typing import TYPE_CHECKING, Any, Optional, Union, cast

from transformers import (AutoFeatureExtractor, AutoImageProcessor,
AutoProcessor, AutoVideoProcessor)
from transformers import (
AutoFeatureExtractor,
AutoImageProcessor,
AutoProcessor,
AutoVideoProcessor,
)
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.image_processing_utils import BaseImageProcessor
from transformers.processing_utils import ProcessorMixin
@@ -121,15 +125,18 @@ def get_processor(
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
"`--trust-remote-code` flag in the CLI."
|
||||
)
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
if not isinstance(processor, processor_cls):
|
||||
raise TypeError("Invalid type of HuggingFace processor. "
|
||||
f"Expected type: {processor_cls}, but "
|
||||
f"found type: {type(processor)}")
|
||||
raise TypeError(
|
||||
"Invalid type of HuggingFace processor. "
|
||||
f"Expected type: {processor_cls}, but "
|
||||
f"found type: {type(processor)}"
|
||||
)
|
||||
|
||||
return processor
|
||||
|
||||
@@ -158,7 +165,7 @@ def get_feature_extractor(
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Load an audio feature extractor for the given model name
|
||||
"""Load an audio feature extractor for the given model name
|
||||
via HuggingFace."""
|
||||
try:
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
||||
@@ -166,7 +173,8 @@ def get_feature_extractor(
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
@@ -177,7 +185,8 @@ def get_feature_extractor(
|
||||
"extractor is a custom extractor not yet available in the "
|
||||
"HuggingFace transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
"`--trust-remote-code` flag in the CLI."
|
||||
)
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
@@ -213,7 +222,8 @@ def get_image_processor(
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
@@ -224,7 +234,8 @@ def get_image_processor(
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
"`--trust-remote-code` flag in the CLI."
|
||||
)
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
@@ -263,7 +274,8 @@ def get_video_processor(
|
||||
*args,
|
||||
revision=revision,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**kwargs)
|
||||
**kwargs,
|
||||
)
|
||||
except ValueError as e:
|
||||
# If the error pertains to the processor class not existing or not
|
||||
# currently being imported, suggest using the --trust-remote-code flag.
|
||||
@@ -274,7 +286,8 @@ def get_video_processor(
|
||||
"a custom processor not yet available in the HuggingFace "
|
||||
"transformers library, consider setting "
|
||||
"`trust_remote_code=True` in LLM or using the "
|
||||
"`--trust-remote-code` flag in the CLI.")
|
||||
"`--trust-remote-code` flag in the CLI."
|
||||
)
|
||||
raise RuntimeError(err_msg) from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
@@ -8,8 +8,7 @@ reasons:
|
||||
- There is a need to override the existing processor to support vLLM.
|
||||
"""
|
||||
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
||||
DeepseekVLV2Processor)
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
|
||||
from vllm.transformers_utils.processors.ovis import OvisProcessor
|
||||
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
|
||||
|
||||
|
||||
@@ -30,8 +30,7 @@ import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
@@ -9,33 +9,31 @@ import PIL
|
||||
import torch
|
||||
from transformers import AutoProcessor, BatchFeature
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
|
||||
Unpack)
|
||||
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
||||
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
||||
|
||||
__all__ = ['Ovis2_5Processor']
|
||||
__all__ = ["Ovis2_5Processor"]
|
||||
IMAGE_TOKEN = "<image>"
|
||||
VIDEO_TOKEN = "<video>"
|
||||
MIN_PIXELS = 448 * 448
|
||||
MAX_PIXELS = 1792 * 1792
|
||||
|
||||
|
||||
class Ovis2_5ProcessorKwargs(ProcessingKwargs,
|
||||
total=False): # type: ignore[call-arg]
|
||||
class Ovis2_5ProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
|
||||
_defaults = {
|
||||
"text_kwargs": {
|
||||
"padding": False,
|
||||
},
|
||||
"images_kwargs": {
|
||||
'convert_to_rgb': True,
|
||||
'min_pixels': MIN_PIXELS,
|
||||
'max_pixels': MAX_PIXELS,
|
||||
"convert_to_rgb": True,
|
||||
"min_pixels": MIN_PIXELS,
|
||||
"max_pixels": MAX_PIXELS,
|
||||
},
|
||||
"videos_kwargs": {
|
||||
'convert_to_rgb': True,
|
||||
'min_pixels': MIN_PIXELS,
|
||||
'max_pixels': MAX_PIXELS,
|
||||
}
|
||||
"convert_to_rgb": True,
|
||||
"min_pixels": MIN_PIXELS,
|
||||
"max_pixels": MAX_PIXELS,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -43,8 +41,8 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
r"""
|
||||
Constructs an Ovis processor which wraps an Ovis image processor
|
||||
and a Qwen2 tokenizer into a single processor.
|
||||
[`OvisProcessor`] offers all the functionalities of
|
||||
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
|
||||
[`OvisProcessor`] offers all the functionalities of
|
||||
[`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
|
||||
See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`]
|
||||
for more information.
|
||||
Args:
|
||||
@@ -81,9 +79,7 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
self.patch_size = patch_size
|
||||
self.hidden_stride = hidden_stride
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
super().__init__(image_processor,
|
||||
tokenizer,
|
||||
chat_template=chat_template)
|
||||
super().__init__(image_processor, tokenizer, chat_template=chat_template)
|
||||
|
||||
@cached_property
|
||||
def extra_special_tokens(self):
|
||||
@@ -96,7 +92,7 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
"image_end": -302,
|
||||
"video_start": -303,
|
||||
"video_end": -304,
|
||||
'image_pad': image_pad_token_id,
|
||||
"image_pad": image_pad_token_id,
|
||||
}
|
||||
return extra_special_tokens
|
||||
|
||||
@@ -104,8 +100,9 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
self,
|
||||
images: ImageInput = None,
|
||||
videos: Union[np.ndarray, list[ImageInput]] = None,
|
||||
text: Union[TextInput, PreTokenizedInput, list[TextInput],
|
||||
list[PreTokenizedInput]] = None,
|
||||
text: Union[
|
||||
TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
|
||||
] = None,
|
||||
**kwargs: Unpack[Ovis2_5ProcessorKwargs],
|
||||
) -> BatchFeature:
|
||||
"""
|
||||
@@ -148,9 +145,9 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
|
||||
- **input_ids** -- list of token ids to be fed to a model.
|
||||
Returned when `text` is not `None`.
|
||||
- **attention_mask** -- list of indices specifying which tokens
|
||||
- **attention_mask** -- list of indices specifying which tokens
|
||||
should be attended to by the model (when
|
||||
`return_attention_mask=True` or if *"attention_mask"*
|
||||
`return_attention_mask=True` or if *"attention_mask"*
|
||||
is in `self.model_input_names` and if `text` is not `None`).
|
||||
- **pixel_values** -- Pixel values to be fed to a model.
|
||||
Returned when `images` is not `None`.
|
||||
@@ -177,9 +174,9 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
grids = []
|
||||
# Process each image
|
||||
for image in images if isinstance(images, list) else [images]:
|
||||
pixel_values, image_placeholders, grid = (
|
||||
self.preprocess_multidata(
|
||||
images=image, **output_kwargs["images_kwargs"]))
|
||||
pixel_values, image_placeholders, grid = self.preprocess_multidata(
|
||||
images=image, **output_kwargs["images_kwargs"]
|
||||
)
|
||||
processed_images.append(pixel_values)
|
||||
image_placeholders_list.append(image_placeholders)
|
||||
grids.append(grid)
|
||||
@@ -196,16 +193,15 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
grids = []
|
||||
# Process each video
|
||||
for video in videos if isinstance(videos, list) else [videos]:
|
||||
pixel_values, video_placeholders, grid = (
|
||||
self.preprocess_multidata(
|
||||
video=video, **output_kwargs["videos_kwargs"]))
|
||||
pixel_values, video_placeholders, grid = self.preprocess_multidata(
|
||||
video=video, **output_kwargs["videos_kwargs"]
|
||||
)
|
||||
processed_videos.append(pixel_values)
|
||||
videos_placeholders_list.append(video_placeholders)
|
||||
grids.append(grid)
|
||||
# assign all processed videos
|
||||
if processed_videos:
|
||||
visual_features[
|
||||
"video_placeholders"] = videos_placeholders_list
|
||||
visual_features["video_placeholders"] = videos_placeholders_list
|
||||
output["video_pixel_values"] = processed_videos
|
||||
output["video_grids"] = grids
|
||||
|
||||
@@ -220,14 +216,16 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
image_idx = 0
|
||||
video_idx = 0
|
||||
for ids_tensor in tokenized_batched_text:
|
||||
has_image_tokens = (image_token_id in ids_tensor
|
||||
and "image_placeholders" in visual_features
|
||||
and image_idx < len(
|
||||
visual_features["image_placeholders"]))
|
||||
has_video_tokens = (video_token_id in ids_tensor
|
||||
and "video_placeholders" in visual_features
|
||||
and video_idx < len(
|
||||
visual_features["video_placeholders"]))
|
||||
has_image_tokens = (
|
||||
image_token_id in ids_tensor
|
||||
and "image_placeholders" in visual_features
|
||||
and image_idx < len(visual_features["image_placeholders"])
|
||||
)
|
||||
has_video_tokens = (
|
||||
video_token_id in ids_tensor
|
||||
and "video_placeholders" in visual_features
|
||||
and video_idx < len(visual_features["video_placeholders"])
|
||||
)
|
||||
if has_image_tokens or has_video_tokens:
|
||||
# Convert to list for easier manipulation
|
||||
ids_list = ids_tensor.tolist()
|
||||
@@ -237,13 +235,13 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
for token_id in ids_list:
|
||||
if token_id == image_token_id:
|
||||
new_ids.extend(
|
||||
visual_features["image_placeholders"]
|
||||
[image_idx])
|
||||
visual_features["image_placeholders"][image_idx]
|
||||
)
|
||||
image_idx += 1
|
||||
elif token_id == video_token_id:
|
||||
new_ids.extend(
|
||||
visual_features["video_placeholders"]
|
||||
[video_idx])
|
||||
visual_features["video_placeholders"][video_idx]
|
||||
)
|
||||
video_idx += 1
|
||||
else:
|
||||
new_ids.append(token_id)
|
||||
@@ -260,8 +258,7 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
# If only images were provided
|
||||
return BatchFeature(data=visual_features)
|
||||
|
||||
def _tokenize_with_visual_symbol(self,
|
||||
text_list: list[str]) -> torch.LongTensor:
|
||||
def _tokenize_with_visual_symbol(self, text_list: list[str]) -> torch.LongTensor:
|
||||
batch_token_ids = []
|
||||
for text in text_list:
|
||||
token_ids = []
|
||||
@@ -288,21 +285,24 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
return torch.tensor(batch_token_ids, dtype=torch.long)
|
||||
|
||||
# Copied from qwen2_vl
|
||||
def smart_resize(self,
|
||||
height: int,
|
||||
width: int,
|
||||
factor: int = 28,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS):
|
||||
def smart_resize(
|
||||
self,
|
||||
height: int,
|
||||
width: int,
|
||||
factor: int = 28,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS,
|
||||
):
|
||||
"""Rescales the image so that the following conditions are met:
|
||||
1. Both dimensions (height and width) are divisible by 'factor'.
|
||||
2. The total number of pixels is within the range
|
||||
2. The total number of pixels is within the range
|
||||
['min_pixels', 'max_pixels'].
|
||||
3. The aspect ratio of the image is maintained as closely as possible.
|
||||
"""
|
||||
if height < factor or width < factor:
|
||||
print(f"height:{height} or width:{width} must be "
|
||||
f"larger than factor:{factor}")
|
||||
print(
|
||||
f"height:{height} or width:{width} must be larger than factor:{factor}"
|
||||
)
|
||||
if height < width:
|
||||
width = round(factor / height * width)
|
||||
height = factor
|
||||
@@ -311,8 +311,10 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
width = factor
|
||||
|
||||
elif max(height, width) / min(height, width) > 200:
|
||||
print(f"absolute aspect ratio must be smaller than 200, "
|
||||
f"got {max(height, width) / min(height, width)}")
|
||||
print(
|
||||
f"absolute aspect ratio must be smaller than 200, "
|
||||
f"got {max(height, width) / min(height, width)}"
|
||||
)
|
||||
if height > width:
|
||||
height = 200 * width
|
||||
else:
|
||||
@@ -335,29 +337,27 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
|
||||
def construct_visual_indicators(self, grid, is_video: bool = False):
|
||||
if is_video:
|
||||
start_token = self.get_token_value('video_start')
|
||||
end_token = self.get_token_value('video_end')
|
||||
start_token = self.get_token_value("video_start")
|
||||
end_token = self.get_token_value("video_end")
|
||||
else:
|
||||
start_token = self.get_token_value('image_start')
|
||||
end_token = self.get_token_value('image_end')
|
||||
start_token = self.get_token_value("image_start")
|
||||
end_token = self.get_token_value("image_end")
|
||||
|
||||
image_placeholders = [start_token, self.get_token_value('visual_atom')]
|
||||
image_placeholders = [start_token, self.get_token_value("visual_atom")]
|
||||
if grid[0] * grid[1] > 1:
|
||||
for r in range(grid[0]):
|
||||
for c in range(grid[1]):
|
||||
image_placeholders.append(
|
||||
self.get_token_value('visual_atom'))
|
||||
image_placeholders.append(self.get_token_value("visual_atom"))
|
||||
|
||||
image_placeholders.append(end_token)
|
||||
return image_placeholders
|
||||
|
||||
def construct_visual_placeholders(self, grid, is_video: bool = False):
|
||||
visual_placeholders = self.construct_visual_indicators((1, 1),
|
||||
is_video)
|
||||
visual_placeholders = self.construct_visual_indicators((1, 1), is_video)
|
||||
|
||||
image_atom_token_id = self.get_token_value('visual_atom')
|
||||
image_atom_token_id = self.get_token_value("visual_atom")
|
||||
# Extract the padding token ID from tokenizer
|
||||
image_padding_token_id = self.get_token_value('image_pad')
|
||||
image_padding_token_id = self.get_token_value("image_pad")
|
||||
|
||||
num_image_atoms = grid[0] * grid[1] * grid[2]
|
||||
num_image_atoms //= self.hidden_stride**2
|
||||
@@ -367,8 +367,9 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
padded_placeholder_tokens = []
|
||||
for token in visual_placeholders:
|
||||
if token == image_atom_token_id:
|
||||
padded_placeholder_tokens.extend([image_padding_token_id] *
|
||||
num_image_atoms)
|
||||
padded_placeholder_tokens.extend(
|
||||
[image_padding_token_id] * num_image_atoms
|
||||
)
|
||||
else:
|
||||
padded_placeholder_tokens.append(image_padding_token_id)
|
||||
return padded_placeholder_tokens
|
||||
@@ -380,7 +381,7 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
convert_to_rgb: Optional[bool] = True,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS,
|
||||
return_tensors: Optional[str] = 'pt',
|
||||
return_tensors: Optional[str] = "pt",
|
||||
):
|
||||
is_video = False
|
||||
if images is not None:
|
||||
@@ -396,11 +397,12 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
images.append(image)
|
||||
elif isinstance(video, list):
|
||||
images = video
|
||||
min_pixels = min(max_pixels if max_pixels is not None else MAX_PIXELS,
|
||||
min_pixels if min_pixels is not None else MIN_PIXELS)
|
||||
min_pixels = min(
|
||||
max_pixels if max_pixels is not None else MAX_PIXELS,
|
||||
min_pixels if min_pixels is not None else MIN_PIXELS,
|
||||
)
|
||||
images = [
|
||||
image.convert("RGB")
|
||||
if convert_to_rgb and image.mode != 'RGB' else image
|
||||
image.convert("RGB") if convert_to_rgb and image.mode != "RGB" else image
|
||||
for image in images
|
||||
]
|
||||
|
||||
@@ -417,14 +419,16 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
)
|
||||
new_size = dict(height=resized_height, width=resized_width)
|
||||
image_pt = self.image_processor.preprocess(
|
||||
image, size=new_size, return_tensors="np")['pixel_values'][0]
|
||||
image, size=new_size, return_tensors="np"
|
||||
)["pixel_values"][0]
|
||||
|
||||
processed_images.append(image_pt)
|
||||
|
||||
patches = np.array(processed_images)
|
||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||
num_to_pad = self.temporal_patch_size - (patches.shape[0] %
|
||||
self.temporal_patch_size)
|
||||
num_to_pad = self.temporal_patch_size - (
|
||||
patches.shape[0] % self.temporal_patch_size
|
||||
)
|
||||
repeats = np.repeat(patches[-1][np.newaxis], num_to_pad, axis=0)
|
||||
patches = np.concatenate([patches, repeats], axis=0)
|
||||
channel = patches.shape[1]
|
||||
@@ -445,14 +449,18 @@ class Ovis2_5Processor(ProcessorMixin):
|
||||
)
|
||||
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
|
||||
flatten_patches = patches.reshape(
|
||||
grid_t * grid_h * grid_w, channel * self.temporal_patch_size *
|
||||
self.patch_size * self.patch_size)
|
||||
grid_t * grid_h * grid_w,
|
||||
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
||||
)
|
||||
|
||||
visual_placeholders = self.construct_visual_placeholders(
|
||||
[grid_t, grid_h, grid_w], is_video)
|
||||
return torch.tensor(
|
||||
flatten_patches), visual_placeholders, torch.tensor(
|
||||
[[grid_t, grid_h, grid_w]])
|
||||
[grid_t, grid_h, grid_w], is_video
|
||||
)
|
||||
return (
|
||||
torch.tensor(flatten_patches),
|
||||
visual_placeholders,
|
||||
torch.tensor([[grid_t, grid_h, grid_w]]),
|
||||
)
|
||||
|
||||
|
||||
AutoProcessor.register("Ovis2_5Processor", Ovis2_5Processor)
|
||||
|
||||
@@ -14,7 +14,7 @@ from vllm.utils import PlaceholderModule

logger = init_logger(__name__)

SUPPORTED_SCHEMES = ['s3://', 'gs://']
SUPPORTED_SCHEMES = ["s3://", "gs://"]

try:
from runai_model_streamer import list_safetensors as runai_list_safetensors
@@ -22,11 +22,9 @@ try:
except (ImportError, OSError):
# see https://github.com/run-ai/runai-model-streamer/issues/26
# OSError will be raised on arm64 platform
runai_model_streamer = PlaceholderModule(
"runai_model_streamer") # type: ignore[assignment]
runai_model_streamer = PlaceholderModule("runai_model_streamer") # type: ignore[assignment]
runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
runai_list_safetensors = runai_model_streamer.placeholder_attr(
"list_safetensors")
runai_list_safetensors = runai_model_streamer.placeholder_attr("list_safetensors")

def list_safetensors(path: str = "") -> list[str]:
|
||||
@@ -65,8 +63,10 @@ class ObjectStorageModel:
|
||||
signal.signal(sig, self._close_by_signal(existing_handler))
|
||||
|
||||
dir_name = os.path.join(
|
||||
get_cache_dir(), "model_streamer",
|
||||
hashlib.sha256(str(url).encode()).hexdigest()[:8])
|
||||
get_cache_dir(),
|
||||
"model_streamer",
|
||||
hashlib.sha256(str(url).encode()).hexdigest()[:8],
|
||||
)
|
||||
if os.path.exists(dir_name):
|
||||
shutil.rmtree(dir_name)
|
||||
os.makedirs(dir_name)
|
||||
@@ -78,7 +78,6 @@ class ObjectStorageModel:
|
||||
shutil.rmtree(self.dir)
|
||||
|
||||
def _close_by_signal(self, existing_handler=None):
|
||||
|
||||
def new_handler(signum, frame):
|
||||
self._close()
|
||||
if existing_handler:
|
||||
@@ -86,10 +85,12 @@ class ObjectStorageModel:
|
||||
|
||||
return new_handler
|
||||
|
||||
def pull_files(self,
|
||||
model_path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None) -> None:
|
||||
def pull_files(
|
||||
self,
|
||||
model_path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Pull files from object storage into the temporary directory.
|
||||
|
||||
|
||||
@@ -17,21 +17,25 @@ except ImportError:
|
||||
|
||||
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths if any(
|
||||
fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
path
|
||||
for path in paths
|
||||
if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
|
||||
return [
|
||||
path for path in paths
|
||||
path
|
||||
for path in paths
|
||||
if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
|
||||
]
|
||||
|
||||
|
||||
def glob(s3: Optional["BaseClient"] = None,
|
||||
path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None) -> list[str]:
|
||||
def glob(
|
||||
s3: Optional["BaseClient"] = None,
|
||||
path: str = "",
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
List full file names from S3 path and filter by allow pattern.
|
||||
|
||||
@@ -47,17 +51,15 @@ def glob(s3: Optional["BaseClient"] = None,
|
||||
s3 = boto3.client("s3")
|
||||
if not path.endswith("/"):
|
||||
path = path + "/"
|
||||
bucket_name, _, paths = list_files(s3,
|
||||
path=path,
|
||||
allow_pattern=allow_pattern)
|
||||
bucket_name, _, paths = list_files(s3, path=path, allow_pattern=allow_pattern)
|
||||
return [f"s3://{bucket_name}/{path}" for path in paths]
|
||||
|
||||
|
||||
def list_files(
|
||||
s3: "BaseClient",
|
||||
path: str,
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None
|
||||
s3: "BaseClient",
|
||||
path: str,
|
||||
allow_pattern: Optional[list[str]] = None,
|
||||
ignore_pattern: Optional[list[str]] = None,
|
||||
) -> tuple[str, str, list[str]]:
|
||||
"""
|
||||
List files from S3 path and filter by pattern.
|
||||
@@ -71,17 +73,17 @@ def list_files(
|
||||
Returns:
|
||||
tuple[str, str, list[str]]: A tuple where:
|
||||
- The first element is the bucket name
|
||||
- The second element is string represent the bucket
|
||||
- The second element is string represent the bucket
|
||||
and the prefix as a dir like string
|
||||
- The third element is a list of files allowed or
|
||||
- The third element is a list of files allowed or
|
||||
disallowed by pattern
|
||||
"""
|
||||
parts = path.removeprefix('s3://').split('/')
|
||||
prefix = '/'.join(parts[1:])
|
||||
parts = path.removeprefix("s3://").split("/")
|
||||
prefix = "/".join(parts[1:])
|
||||
bucket_name = parts[0]
|
||||
|
||||
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
|
||||
paths = [obj['Key'] for obj in objects.get('Contents', [])]
|
||||
paths = [obj["Key"] for obj in objects.get("Contents", [])]
|
||||
|
||||
paths = _filter_ignore(paths, ["*/"])
|
||||
if allow_pattern is not None:
|
||||
|
||||
@@ -10,14 +10,12 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import (
|
||||
get_sentence_transformer_tokenizer_config)
|
||||
from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config
|
||||
from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
|
||||
@@ -32,8 +30,7 @@ else:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||
TokenizerBase]
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, TokenizerBase]
|
||||
|
||||
|
||||
def decode_tokens(
|
||||
@@ -50,8 +47,7 @@ def decode_tokens(
|
||||
settings.
|
||||
"""
|
||||
if skip_special_tokens is not None:
|
||||
return tokenizer.decode(token_ids,
|
||||
skip_special_tokens=skip_special_tokens)
|
||||
return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
return tokenizer.decode(token_ids)
|
||||
|
||||
@@ -95,8 +91,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
||||
|
||||
tokenizer_all_special_ids = tokenizer.all_special_ids
|
||||
tokenizer_all_special_tokens = tokenizer.all_special_tokens
|
||||
tokenizer_all_special_tokens_extended = (
|
||||
tokenizer.all_special_tokens_extended)
|
||||
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
|
||||
tokenizer_vocab = tokenizer.get_vocab()
|
||||
tokenizer_len = len(tokenizer)
|
||||
|
||||
@@ -110,7 +105,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
||||
max_token_id = max(max_token_id, tokenizer.vocab_size)
|
||||
|
||||
class CachedTokenizer(tokenizer.__class__): # type: ignore
|
||||
|
||||
@property
|
||||
def all_special_ids(self) -> list[int]:
|
||||
return tokenizer_all_special_ids
|
||||
@@ -134,7 +128,7 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
|
||||
return tokenizer_len
|
||||
|
||||
def __reduce__(self):
|
||||
return get_cached_tokenizer, (tokenizer, )
|
||||
return get_cached_tokenizer, (tokenizer,)
|
||||
|
||||
CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
|
||||
|
||||
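The `__reduce__` above is presumably what keeps the dynamically created cached tokenizer picklable: instead of pickling the ad-hoc subclass, it pickles the wrapped tokenizer and rebuilds the cache on load. A toy stand-in (not the vLLM classes) showing the mechanism:

import pickle

class Tokenizer:
    # Minimal stand-in for a HF tokenizer.
    vocab = {"a": 0, "b": 1}

def get_cached_tokenizer(tokenizer):
    class CachedTokenizer(tokenizer.__class__):
        def __reduce__(self):
            # Pickle as "call get_cached_tokenizer(tokenizer) again on load".
            return get_cached_tokenizer, (tokenizer,)

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
    return CachedTokenizer()

cached = get_cached_tokenizer(Tokenizer())
restored = pickle.loads(pickle.dumps(cached))  # would fail without __reduce__
print(type(restored).__name__)  # CachedTokenizer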
@@ -151,8 +145,7 @@ def get_tokenizer(
|
||||
download_dir: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> AnyTokenizer:
|
||||
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope.
|
||||
"""
|
||||
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
|
if envs.VLLM_USE_MODELSCOPE:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
@@ -173,13 +166,13 @@ def get_tokenizer(
revision=revision,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
tokenizer_name = tokenizer_path

if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError(
"Cannot use the fast tokenizer in slow tokenizer mode.")
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False

if "truncation_side" not in kwargs:
@@ -195,23 +188,28 @@ def get_tokenizer(
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai"
if is_from_mistral_org and tokenizer_mode != "mistral":
warnings.warn(
'It is strongly recommended to run mistral models with '
"It is strongly recommended to run mistral models with "
'`--tokenizer-mode "mistral"` to ensure correct '
'encoding and decoding.',
"encoding and decoding.",
FutureWarning,
stacklevel=2)
stacklevel=2,
)

tokenizer: AnyTokenizer
if tokenizer_mode == "mistral":
tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
revision=revision)
tokenizer = MistralTokenizer.from_pretrained(
str(tokenizer_name), revision=revision
)
elif tokenizer_mode == "custom":
from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
*args,
revision=revision,
download_dir=download_dir,
**kwargs)

tokenizer = TokenizerRegistry.get_tokenizer(
str(tokenizer_name),
*args,
revision=revision,
download_dir=download_dir,
**kwargs,
)
else:
try:
tokenizer = AutoTokenizer.from_pretrained(
@@ -226,13 +224,16 @@ def get_tokenizer(
# currently being imported,
# suggest using the --trust-remote-code flag.
if not trust_remote_code and (
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)):
err_msg = ("Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI.")
"does not exist or is not currently imported." in str(e)
or "requires you to execute the tokenizer file" in str(e)
):
err_msg = (
"Failed to load the tokenizer. If the tokenizer "
"is a custom tokenizer not yet available in the "
"HuggingFace transformers library, consider "
"setting `trust_remote_code=True` in LLM or using "
"the `--trust-remote-code` flag in the CLI."
)
raise RuntimeError(err_msg) from e
else:
raise e
@@ -240,19 +241,21 @@ def get_tokenizer(
# The special_tokens in tokenizer should also be
# controlled by do_lower_case in encoder_config
encoder_config = get_sentence_transformer_tokenizer_config(
tokenizer_name, revision)
tokenizer_name, revision
)
if isinstance(encoder_config, dict) and encoder_config.get(
"do_lower_case", False):
"do_lower_case", False
):
special_tokens_map = {
k: v.lower()
for k, v in tokenizer.special_tokens_map.items()
k: v.lower() for k, v in tokenizer.special_tokens_map.items()
}
tokenizer.add_special_tokens(special_tokens_map)

if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
"slowdown. Consider using a fast tokenizer instead."
)
tokenizer = get_cached_tokenizer(tokenizer)

return tokenizer

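For readers following the `get_tokenizer` hunks above, a minimal usage sketch: it assumes the function is importable from vllm.transformers_utils.tokenizer and accepts the tokenizer_mode and revision keywords shown in the diff; the repo id is purely illustrative.

from vllm.transformers_utils.tokenizer import get_tokenizer

# Hedged sketch: "mistralai/Mistral-7B-Instruct-v0.3" is an illustrative repo id.
# tokenizer_mode="mistral" takes the MistralTokenizer branch shown above and
# avoids the FutureWarning emitted for mistralai models in other modes.
tokenizer = get_tokenizer(
    "mistralai/Mistral-7B-Instruct-v0.3",
    tokenizer_mode="mistral",
)
print(tokenizer.encode("Hello, world!"))
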
@@ -10,7 +10,6 @@ if TYPE_CHECKING:


class TokenizerBase(ABC):

@property
@abstractmethod
def all_special_tokens_extended(self) -> list[str]:
@@ -98,18 +97,22 @@ class TokenizerBase(ABC):
raise NotImplementedError()

@abstractmethod
def encode(self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None) -> list[int]:
def encode(
self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
) -> list[int]:
raise NotImplementedError()

@abstractmethod
def apply_chat_template(self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs) -> list[int]:
def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs,
) -> list[int]:
raise NotImplementedError()

@abstractmethod
@@ -117,9 +120,9 @@ class TokenizerBase(ABC):
raise NotImplementedError()

@abstractmethod
def decode(self,
ids: Union[list[int], int],
skip_special_tokens: bool = True) -> str:
def decode(
self, ids: Union[list[int], int], skip_special_tokens: bool = True
) -> str:
raise NotImplementedError()

@abstractmethod

@@ -1,10 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
truncate_tool_call_ids, validate_request_params)
from .mistral import (
MistralTokenizer,
maybe_serialize_tool_calls,
truncate_tool_call_ids,
validate_request_params,
)

__all__ = [
"MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids",
"validate_request_params"
"MistralTokenizer",
"maybe_serialize_tool_calls",
"truncate_tool_call_ids",
"validate_request_params",
]

@@ -20,7 +20,8 @@ if TYPE_CHECKING:
# will not be bothered by the dependency.
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
MistralTokenizer as PublicMistralTokenizer,
)

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

@@ -51,7 +52,7 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
# - https://github.com/pydantic/pydantic/issues/9541
# TODO: remove when pydantic v2.11 is released
for i, message in enumerate(request.messages):
if message.get("role") == 'assistant':
if message.get("role") == "assistant":
tool_calls_validator = message.get("tool_calls", ().__iter__())
validated_tool_calls = []
while True:
@@ -67,7 +68,7 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
def truncate_tool_call_ids(request: "ChatCompletionRequest"):
"""Truncates tool call IDs for Mistral's ID requirements."""
for i, message in enumerate(request.messages):
if message.get("role") == 'assistant':
if message.get("role") == "assistant":
tool_calls = message.get("tool_calls", [])
for tool_call in tool_calls:
if len(tool_call["id"]) > 9:
@@ -95,17 +96,19 @@ def truncate_tool_call_ids(request: "ChatCompletionRequest"):


def validate_request_params(request: "ChatCompletionRequest"):
if (request.skip_special_tokens is not None
and not request.skip_special_tokens):
raise ValueError("skip_special_tokens=False is not supported "
"for Mistral tokenizers.")
if request.skip_special_tokens is not None and not request.skip_special_tokens:
raise ValueError(
"skip_special_tokens=False is not supported for Mistral tokenizers."
)


def list_local_repo_files(repo_id: str, revision: Optional[str]) -> list[str]:
repo_cache = os.path.join(
huggingface_hub.constants.HF_HUB_CACHE,
huggingface_hub.constants.REPO_ID_SEPARATOR.join(
["models", *repo_id.split("/")]))
["models", *repo_id.split("/")]
),
)

if revision is None:
revision_file = os.path.join(repo_cache, "refs", "main")
@@ -141,7 +144,8 @@ def find_tokenizer_file(files: list[str]):
raise OSError(
f"Found {len(matched_files)} files matching the "
f"pattern: `{file_pattern.pattern}`. Make sure that a Mistral "
f"tokenizer is present in {files}.")
f"tokenizer is present in {files}."
)

return matched_files[0]

@@ -149,22 +153,23 @@ def find_tokenizer_file(files: list[str]):
def _aggregate_content(content: list) -> list[dict[str, Any]]:
aggregated_content: list[dict[str, Any]] = []
for chunk in content:
if chunk.get("type"
) == "text" and aggregated_content and aggregated_content[
-1].get("type") == "text":
if (
chunk.get("type") == "text"
and aggregated_content
and aggregated_content[-1].get("type") == "text"
):
aggregated_content[-1]["text"] += "\n\n" + chunk.get("text")
else:
aggregated_content.append(chunk)
if len(aggregated_content) == 1 and aggregated_content[0].get(
"type") == "text":
if len(aggregated_content) == 1 and aggregated_content[0].get("type") == "text":
content = aggregated_content[0]["text"]
return content


def make_mistral_chat_completion_request(
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str,
Any]]] = None) -> "ChatCompletionRequest":
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
) -> "ChatCompletionRequest":
last_message = cast(dict[str, Any], messages[-1])
if last_message["role"] == "assistant":
last_message["prefix"] = True
@@ -188,8 +193,7 @@ def make_mistral_chat_completion_request(
# even if they are empty.
if tools:
for function in [
tool["function"] for tool in tools
if tool["type"] == "function"
tool["function"] for tool in tools if tool["type"] == "function"
]:
if function.get("parameters") is None:
function["parameters"] = {}
@@ -197,12 +201,11 @@ def make_mistral_chat_completion_request(
function["description"] = ""

from mistral_common.protocol.instruct.request import ChatCompletionRequest
return ChatCompletionRequest(messages=messages,
tools=tools) # type: ignore[type-var]

return ChatCompletionRequest(messages=messages, tools=tools) # type: ignore[type-var]


class MistralTokenizer(TokenizerBase):

def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
self.mistral = tokenizer
self.instruct = tokenizer.instruct_tokenizer
@@ -215,10 +218,13 @@ class MistralTokenizer(TokenizerBase):

self.is_tekken = isinstance(tokenizer_, Tekkenizer)
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer)
SentencePieceTokenizer,
)

self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer)
self._special_token_policy = (SpecialTokenPolicy.IGNORE
if self.is_tekken else None)
self._special_token_policy = (
SpecialTokenPolicy.IGNORE if self.is_tekken else None
)
if not (self.is_tekken or self.is_spm):
raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}")

@@ -226,57 +232,54 @@ class MistralTokenizer(TokenizerBase):
# Convert to a dict[str, int] to match protocol, but this is a lossy
# conversion. There may be multiple token ids that decode to the same
# string due to partial UTF-8 byte sequences being converted to �
self._vocab_dict = {
token: idx
for idx, token in enumerate(self._vocab)
}
self._vocab_dict = {token: idx for idx, token in enumerate(self._vocab)}
self.tokenizer = tokenizer_
self._max_token_id = self.vocab_size - 1

@classmethod
def from_pretrained(cls,
path_or_repo_id: str,
*,
revision: Optional[str] = None) -> "MistralTokenizer":
def from_pretrained(
cls, path_or_repo_id: str, *, revision: Optional[str] = None
) -> "MistralTokenizer":
if not Path(path_or_repo_id).exists():
assert len(path_or_repo_id.split("/")) == 2, (
"You have either provided a non-existent path: "
"{path_or_repo_id} or an invalid HF Hub repo id.")
"{path_or_repo_id} or an invalid HF Hub repo id."
)
tokenizer_file = cls._download_mistral_tokenizer_from_hf(
path_or_repo_id, revision)
path_or_repo_id, revision
)
elif Path(path_or_repo_id).is_dir():
tokenizer_file_name = find_tokenizer_file(
os.listdir(path_or_repo_id))
tokenizer_file_name = find_tokenizer_file(os.listdir(path_or_repo_id))
tokenizer_file = str(Path(path_or_repo_id) / tokenizer_file_name)
else:
assert Path(
path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
assert Path(path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}"
tokenizer_file = str(Path(path_or_repo_id))

from mistral_common.tokens.tokenizers.mistral import (
MistralTokenizer as PublicMistralTokenizer)
MistralTokenizer as PublicMistralTokenizer,
)

mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file)
return cls(mistral_tokenizer)

@staticmethod
def _download_mistral_tokenizer_from_hf(tokenizer_name: str,
revision: Optional[str]) -> str:
def _download_mistral_tokenizer_from_hf(
tokenizer_name: str, revision: Optional[str]
) -> str:
try:
hf_api = HfApi()
files = hf_api.list_repo_files(repo_id=tokenizer_name,
revision=revision)
files = hf_api.list_repo_files(repo_id=tokenizer_name, revision=revision)
except ConnectionError as exc:
files = list_local_repo_files(repo_id=tokenizer_name,
revision=revision)
files = list_local_repo_files(repo_id=tokenizer_name, revision=revision)

if len(files) == 0:
raise exc

filename = find_tokenizer_file(files)

tokenizer_file = hf_hub_download(tokenizer_name,
filename=filename,
revision=revision)
tokenizer_file = hf_hub_download(
tokenizer_name, filename=filename, revision=revision
)
return tokenizer_file

# the following attributes are set to fit vLLM's design and are used
@@ -290,10 +293,7 @@ class MistralTokenizer(TokenizerBase):
special_tokens = self.tokenizer.SPECIAL_TOKENS
else:
special_tokens = list(SpecialTokens)
return [
s.value if isinstance(s, SpecialTokens) else s
for s in special_tokens
]
return [s.value if isinstance(s, SpecialTokens) else s for s in special_tokens]

@property
def all_special_tokens(self) -> list[str]:
@@ -301,9 +301,7 @@ class MistralTokenizer(TokenizerBase):

@property
def all_special_ids(self) -> list[int]:
return [
self.all_special_tokens.index(t) for t in self.all_special_tokens
]
return [self.all_special_tokens.index(t) for t in self.all_special_tokens]

@property
def bos_token_id(self) -> int:
@@ -386,26 +384,29 @@ class MistralTokenizer(TokenizerBase):
input_ids = input_ids[:max_length]
return input_ids

def encode(self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None) -> list[int]:
def encode(
self,
text: str,
truncation: Optional[bool] = None,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
) -> list[int]:
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
# For chat completion use `apply_chat_template`
if add_special_tokens is not None:
return self.tokenizer.encode(text,
bos=add_special_tokens,
eos=add_special_tokens)
return self.tokenizer.encode(
text, bos=add_special_tokens, eos=add_special_tokens
)
else:
return self.tokenizer.encode(text, bos=True, eos=False)

def apply_chat_template(self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs) -> list[int]:

def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
tools: Optional[list[dict[str, Any]]] = None,
**kwargs,
) -> list[int]:
request = make_mistral_chat_completion_request(messages, tools)
encoded = self.mistral.encode_chat_completion(request)

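A short, hedged sketch of how the `apply_chat_template` signature above is typically driven. It assumes the package path vllm.transformers_utils.tokenizers seen in the __init__ hunk earlier; the repo id is illustrative, and the message dict follows the OpenAI-style shape implied by ChatCompletionMessageParam.

from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer

# Hedged sketch: from_pretrained, apply_chat_template, and decode are the
# methods shown in the hunks above; the repo id is a placeholder.
tok = MistralTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
messages = [{"role": "user", "content": "Summarize the plot of Dune."}]
token_ids = tok.apply_chat_template(messages)
print(len(token_ids), tok.decode(token_ids))
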
@@ -414,11 +415,15 @@ class MistralTokenizer(TokenizerBase):

def convert_tokens_to_string(self, tokens: list[str]) -> str:
from mistral_common.tokens.tokenizers.base import SpecialTokens

if self.is_tekken:
tokens = [
t for t in tokens
if (t is SpecialTokens.tool_calls
or t not in self.tokenizer._all_special_tokens)
t
for t in tokens
if (
t is SpecialTokens.tool_calls
or t not in self.tokenizer._all_special_tokens
)
]

if any(isinstance(t, bytes) for t in tokens):
@@ -426,20 +431,20 @@ class MistralTokenizer(TokenizerBase):
shift = self.tokenizer.num_special_tokens

def _token_to_id(t: str):
t_bytes = t.encode("utf-8") \
if not isinstance(t, bytes) else t
t_bytes = t.encode("utf-8") if not isinstance(t, bytes) else t
try:
return shift + \
self.tokenizer._tekken_token2id_nospecial[t_bytes]
return (
shift + self.tokenizer._tekken_token2id_nospecial[t_bytes]
)
except KeyError:
logger.warning(
"Failed to convert token %s to id,"
" replacing with <unk>", t_bytes)
"Failed to convert token %s to id, replacing with <unk>",
t_bytes,
)
return self.tokenizer.unk_id

ids = [_token_to_id(t) for t in tokens]
decoded = self.tokenizer.decode(ids,
self._special_token_policy)
decoded = self.tokenizer.decode(ids, self._special_token_policy)
else:
decoded = "".join(tokens)
else:
@@ -453,8 +458,10 @@ class MistralTokenizer(TokenizerBase):
if token in special_tokens:
if regular_tokens:
decoded_list.append(
self.tokenizer.decode(regular_tokens,
self._special_token_policy))
self.tokenizer.decode(
regular_tokens, self._special_token_policy
)
)
regular_tokens = []
decoded_list.append(token)
else:
@@ -462,19 +469,19 @@ class MistralTokenizer(TokenizerBase):

if regular_tokens:
decoded_list.append(
self.tokenizer.decode(regular_tokens,
self._special_token_policy))
self.tokenizer.decode(regular_tokens, self._special_token_policy)
)

decoded = ''.join(decoded_list)
decoded = "".join(decoded_list)

return decoded

def decode(self,
ids: Union[list[int], int],
skip_special_tokens: bool = True) -> str:
assert (
skip_special_tokens
), "skip_special_tokens=False is not supported for Mistral tokenizers."
def decode(
self, ids: Union[list[int], int], skip_special_tokens: bool = True
) -> str:
assert skip_special_tokens, (
"skip_special_tokens=False is not supported for Mistral tokenizers."
)

if isinstance(ids, int):
ids = [ids]
@@ -486,13 +493,12 @@ class MistralTokenizer(TokenizerBase):
skip_special_tokens: bool = True,
) -> list[str]:
from mistral_common.tokens.tokenizers.base import SpecialTokens
from mistral_common.tokens.tokenizers.instruct import (
InstructTokenizerV13)
from mistral_common.tokens.tokenizers.instruct import InstructTokenizerV13

# TODO(Patrick) - potentially allow special tokens to not be skipped
assert (
skip_special_tokens
), "skip_special_tokens=False is not supported for Mistral tokenizers."
assert skip_special_tokens, (
"skip_special_tokens=False is not supported for Mistral tokenizers."
)

assert self.is_tekken or self.is_spm, type(self.tokenizer)

@@ -507,8 +513,9 @@ class MistralTokenizer(TokenizerBase):
if self.instruct.END_THINK:
non_skip_special_tokens.add(self.instruct.END_THINK)
ids = [
i for i in ids if i > self.tokenizer.num_special_tokens
or i in non_skip_special_tokens
i
for i in ids
if i > self.tokenizer.num_special_tokens or i in non_skip_special_tokens
]

tokens = [self.tokenizer.id_to_piece(id) for id in ids]

@@ -15,7 +15,7 @@ logger = init_logger(__name__)


def is_s3(model_or_path: str) -> bool:
return model_or_path.lower().startswith('s3://')
return model_or_path.lower().startswith("s3://")


def check_gguf_file(model: Union[str, PathLike]) -> bool:
@@ -43,13 +43,16 @@ def modelscope_list_repo_files(
) -> list[str]:
"""List files in a modelscope repo."""
from modelscope.hub.api import HubApi

api = HubApi()
api.login(token)
# same as huggingface_hub.list_repo_files
files = [
file['Path'] for file in api.get_model_files(
model_id=repo_id, revision=revision, recursive=True)
if file['Type'] == 'blob'
file["Path"]
for file in api.get_model_files(
model_id=repo_id, revision=revision, recursive=True
)
if file["Type"] == "blob"
]
return files

@@ -91,18 +94,18 @@ def maybe_model_redirect(model: str) -> str:
if not Path(model_redirect_path).exists():
return model

redirect_dict = (_maybe_json_dict(model_redirect_path)
or _maybe_space_split_dict(model_redirect_path))
if (redirect_model := redirect_dict.get(model)):
redirect_dict = _maybe_json_dict(model_redirect_path) or _maybe_space_split_dict(
model_redirect_path
)
if redirect_model := redirect_dict.get(model):
logger.info("model redirect: [ %s ] -> [ %s ]", model, redirect_model)
return redirect_model

return model
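
The `maybe_model_redirect` hunk above reads a redirect table from a file; the on-disk format is not shown in this diff, so the sketch below only illustrates the two shapes the helper names `_maybe_json_dict` and `_maybe_space_split_dict` suggest (a JSON object, or one space-separated pair per line). Treat both file layouts as assumptions.

import json
from pathlib import Path

# Hypothetical redirect tables; formats inferred from the helper names above,
# not confirmed by this diff.
Path("model_redirect.json").write_text(
    json.dumps({"org/old-model": "org/new-model"})
)
Path("model_redirect.txt").write_text("org/old-model org/new-model\n")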


def parse_safetensors_file_metadata(
path: Union[str, PathLike]) -> dict[str, Any]:
def parse_safetensors_file_metadata(path: Union[str, PathLike]) -> dict[str, Any]:
with open(path, "rb") as f:
length_of_metadata = struct.unpack('<Q', f.read(8))[0]
metadata = json.loads(f.read(length_of_metadata).decode('utf-8'))
length_of_metadata = struct.unpack("<Q", f.read(8))[0]
metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))
return metadata
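
To make the parsing logic in `parse_safetensors_file_metadata` concrete, here is a self-contained sketch that writes a synthetic header in the safetensors layout (an 8-byte little-endian length followed by a JSON blob) and reads it back the same way the function does. The tensor entry is made up for illustration.

import json
import struct
import tempfile

# Synthetic safetensors-style header: u64 little-endian length, then JSON.
header = {"weight": {"dtype": "F32", "shape": [2, 2], "data_offsets": [0, 16]}}
payload = json.dumps(header).encode("utf-8")

with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
    f.write(struct.pack("<Q", len(payload)))
    f.write(payload)
    path = f.name

# Mirrors the read path in parse_safetensors_file_metadata above.
with open(path, "rb") as f:
    length_of_metadata = struct.unpack("<Q", f.read(8))[0]
    metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))

assert metadata == header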