import os
from typing import Optional, Union

import huggingface_hub
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizers import BaichuanTokenizer
from vllm.utils import make_async

logger = init_logger(__name__)

def get_cached_tokenizer(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Get tokenizer with cached properties.

    This will patch the tokenizer object in place.

    By default, transformers will recompute multiple tokenizer properties
    each time they are called, leading to a significant slowdown. This
    function caches these properties for faster access."""

    tokenizer_all_special_ids = set(tokenizer.all_special_ids)
    tokenizer_all_special_tokens_extended = (
        tokenizer.all_special_tokens_extended)
    tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
    tokenizer_len = len(tokenizer)

    class CachedTokenizer(tokenizer.__class__):  # type: ignore

        @property
        def all_special_ids(self):
            return tokenizer_all_special_ids

        @property
        def all_special_tokens(self):
            return tokenizer_all_special_tokens

        @property
        def all_special_tokens_extended(self):
            return tokenizer_all_special_tokens_extended

        def __len__(self):
            return tokenizer_len

    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"

    tokenizer.__class__ = CachedTokenizer
    return tokenizer

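# Illustrative usage sketch (not part of the original module; the model name
# below is an assumption). It shows how a tokenizer is patched in place so
# repeated property accesses hit the cached values instead of being recomputed:
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("gpt2")
#     tok = get_cached_tokenizer(tok)
#     tok.all_special_ids   # served from the precomputed set
#     len(tok)              # served from the cached length
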
def get_tokenizer(
    tokenizer_name: str,
    *args,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    revision: Optional[str] = None,
    download_dir: Optional[str] = None,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
    """
    if VLLM_USE_MODELSCOPE:
        # Download the model from the ModelScope hub.
        # Lazy import so that modelscope is not required for normal use.
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

        # Only set the tokenizer here; the model is downloaded on the workers.
        if not os.path.exists(tokenizer_name):
            tokenizer_path = snapshot_download(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name,
            *args,
            trust_remote_code=trust_remote_code,
            revision=revision,
            **kwargs)
    except ValueError as e:
        # If the error pertains to the tokenizer class not existing or not
        # currently being imported, suggest using the --trust-remote-code flag.
        if (not trust_remote_code and
                ("does not exist or is not currently imported." in str(e)
                 or "requires you to execute the tokenizer file" in str(e))):
            err_msg = (
                "Failed to load the tokenizer. If the tokenizer is a custom "
                "tokenizer not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    except AttributeError as e:
        if "BaichuanTokenizer" in str(e):
            # This is for the error "'BaichuanTokenizer' object has no
            # attribute 'sp_model'".
            tokenizer = BaichuanTokenizer.from_pretrained(
                tokenizer_name,
                *args,
                trust_remote_code=trust_remote_code,
                revision=revision,
                **kwargs)
        else:
            raise e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
            "Using a slow tokenizer. This might cause a significant "
            "slowdown. Consider using a fast tokenizer instead.")
    return get_cached_tokenizer(tokenizer)

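# Illustrative call (a sketch, not part of the original module; the model name
# is an assumption):
#
#     tokenizer = get_tokenizer("facebook/opt-125m",
#                               tokenizer_mode="auto",
#                               trust_remote_code=False)
#     input_ids = tokenizer("Hello, world!").input_ids
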

def get_lora_tokenizer(lora_request: LoRARequest, *args,
                       **kwargs) -> Optional[PreTrainedTokenizer]:
    if lora_request is None:
        return None
    try:
        tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
                                  **kwargs)
    except OSError as e:
        # No tokenizer was found in the LoRA folder,
        # so fall back to the base model tokenizer.
        logger.warning(
            "No tokenizer found in %s, using base model tokenizer instead. "
            "(Exception: %s)", lora_request.lora_local_path, e)
        tokenizer = None
    return tokenizer


get_lora_tokenizer_async = make_async(get_lora_tokenizer)
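# `get_lora_tokenizer_async` wraps `get_lora_tokenizer` with `make_async`, so
# the (potentially slow) tokenizer load runs in a background thread and can be
# awaited from async code. Illustrative sketch (the LoRARequest arguments are
# assumptions, not taken from the original file):
#
#     lora_request = LoRARequest("sql-adapter", 1, "/path/to/adapter")
#     tokenizer = await get_lora_tokenizer_async(lora_request)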