[Core] Remove tokenizer group in vLLM (#24078)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
This commit is contained in:
Zhuohan Li
2025-09-17 01:42:59 -07:00
committed by GitHub
parent c15309a730
commit 6c47f6bfa4
49 changed files with 276 additions and 934 deletions

View File

@@ -37,7 +37,7 @@ from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import PlaceholderModule
try:
@@ -100,8 +100,8 @@ class BenchmarkDataset(ABC):
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
seed.
seed.
Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
@@ -133,10 +133,10 @@ class BenchmarkDataset(ABC):
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
raise TypeError(
"Could not process multimodal content of type: " +
f"{type(mm_content)}"
)
f"{type(mm_content)}"
)
return [{"role": "user", "content": content}]
def load_data(self) -> None:
@@ -155,34 +155,26 @@ class BenchmarkDataset(ABC):
def get_random_lora_request(
self,
tokenizer: PreTrainedTokenizerBase,
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
) -> tuple[Optional[LoRARequest], AnyTokenizer]:
) -> Optional[LoRARequest]:
"""
Optionally select a random LoRA request and return its associated
tokenizer.
Optionally select a random LoRA request.
This method is used when LoRA parameters are provided. It randomly
selects a LoRA based on max_loras and retrieves a cached tokenizer for
that LoRA if available. Otherwise, it returns the base tokenizer.
selects a LoRA based on max_loras.
Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected.
max_loras (Optional[int]): The maximum number of LoRAs available.
If `None`, LoRA is not used.
lora_path (Optional[str]): Path to the LoRA parameters on disk.
If `None`, LoRA is not used.
Returns:
A tuple with the following elements:
- A new [LoRARequest][] (or `None` if not applicable).
- The tokenizer associated with the LoRA request
(or the base tokenizer).
A new [LoRARequest][] (or `None` if not applicable).
"""
if max_loras is None or lora_path is None:
return None, tokenizer
return None
# Generate a random LoRA ID in the range [1, max_loras].
lora_id = random.randint(1, max_loras)
@@ -191,11 +183,7 @@ class BenchmarkDataset(ABC):
lora_int_id=lora_id,
lora_path=lora_path_on_disk(lora_path),
)
if lora_id not in lora_tokenizer_cache:
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
# Return lora_request and the cached tokenizer if available; otherwise,
# return the base tokenizer
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
return lora_request
@abstractmethod
def sample(self, tokenizer: PreTrainedTokenizerBase,
@@ -213,7 +201,7 @@ class BenchmarkDataset(ABC):
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
request_id_prefix (str) The prefix of request_id.
Returns:
list[SampleRequest]: A list of sample requests generated from the
@@ -527,7 +515,7 @@ class RandomDataset(BenchmarkDataset):
size=num_requests)
output_lens = self._rng.integers(output_low, output_high + 1,
size=num_requests)
offsets = self._rng.integers(0, tokenizer.vocab_size,
offsets = self._rng.integers(0, tokenizer.vocab_size,
size=num_requests)
return input_lens, output_lens, offsets
@@ -555,7 +543,7 @@ class RandomDataset(BenchmarkDataset):
the encoded sequence is truncated before being decoded again.
"""
# Build the inner sequence by sampling sequentially from the vocab
inner_seq = ((offset + index + np.arange(input_len))
inner_seq = ((offset + index + np.arange(input_len))
% vocab_size).tolist()
token_sequence = prefix_token_ids + inner_seq
@@ -590,9 +578,9 @@ class RandomMultiModalDataset(RandomDataset):
`num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
The maximum is further clamped to the sum of per-modality limits.
2) Each item's modality and shape is sampled from `bucket_config`, a dict
mapping (height, width, num_frames) → probability. We treat
`num_frames`=1 as image and `num_frames` > 1 as video.
Entries with zero probability are removed and the rest are renormalized
mapping (height, width, num_frames) → probability. We treat
`num_frames`=1 as image and `num_frames` > 1 as video.
Entries with zero probability are removed and the rest are renormalized
to sum to 1.
3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
When a modality reaches its cap, all of its buckets are excluded and the
@@ -600,8 +588,8 @@ class RandomMultiModalDataset(RandomDataset):
Example bucket configuration:
{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
- Two image buckets (`num_frames`=1) and one video bucket
(`num_frames`=16).
- Two image buckets (`num_frames`=1) and one video bucket
(`num_frames`=16).
OBS.: Only image sampling is supported for now.
"""
@@ -624,9 +612,9 @@ class RandomMultiModalDataset(RandomDataset):
def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
"""Generate synthetic PIL image with random RGB values.
NOTE: iid pixel sampling results in worst-case compression
(good for stressing I/O), but very unlike real photos.
NOTE: iid pixel sampling results in worst-case compression
(good for stressing I/O), but very unlike real photos.
We could consider a “low-freq” mode (e.g., noise blur)
to emulate network realism instead of max stress.
"""
@@ -638,11 +626,11 @@ class RandomMultiModalDataset(RandomDataset):
)
return Image.fromarray(random_pixels)
def generate_synthetic_video(self, width: int,
height: int,
def generate_synthetic_video(self, width: int,
height: int,
num_frames: int) -> Any:
"""Generate synthetic video with random values.
TODO: Finish this method.
"""
raise NotImplementedError("Video sampling is WIP.")
@@ -656,7 +644,7 @@ class RandomMultiModalDataset(RandomDataset):
else:
raise ValueError(f"Invalid multimodal item configuration: {config}")
def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int],
def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int],
float]) -> dict[tuple[int, int, int], float]:
"""
Remove zero probability entries
@@ -676,24 +664,24 @@ class RandomMultiModalDataset(RandomDataset):
return {k: v / total for k, v in bucket_config.items()}
def generate_mm_item(self,
def generate_mm_item(self,
mm_item_config: tuple[int, int, int],
) -> Mapping[str, Any]:
"""
Create synthetic images and videos and
Create synthetic images and videos and
apply process_image/process_video respectively.
This follows the OpenAI API chat completions
https://github.com/openai/openai-python
"""
if self.map_config_to_modality(mm_item_config) == "image":
return process_image(self.generate_synthetic_image(
mm_item_config[1],
mm_item_config[0]))
elif self.map_config_to_modality(mm_item_config) == "video":
return process_video(self.generate_synthetic_video(
mm_item_config[1],
mm_item_config[0],
mm_item_config[1],
mm_item_config[0],
mm_item_config[2]))
else:
raise ValueError(f"Invalid multimodal item configuration: "
@@ -723,17 +711,17 @@ class RandomMultiModalDataset(RandomDataset):
f"limit_mm_per_prompt: "
f"{limit_mm_per_prompt.keys()}")
# Remove zero probability entries
# Remove zero probability entries
# and normalize bucket config to sum to 1
bucket_config = self.normalize_bucket_config(bucket_config)
logger.info(
"Normalized bucket config: %s", bucket_config,
)
# Only consider limit per prompt for modalities in bucket config
allowed_modalities = {self.map_config_to_modality(cfg)
allowed_modalities = {self.map_config_to_modality(cfg)
for cfg in bucket_config}
limit_mm_per_prompt = {
k: v for k, v in limit_mm_per_prompt.items()
k: v for k, v in limit_mm_per_prompt.items()
if k in allowed_modalities}
if not limit_mm_per_prompt:
raise ValueError("No valid limits for modalities present in "
@@ -746,19 +734,19 @@ class RandomMultiModalDataset(RandomDataset):
# Get max and min num mm items and ensure
# it is at most the sum of limit_mm_per_prompt for all modalities
max_num_mm_items = min(
sum(limit_mm_per_prompt.values()),
sum(limit_mm_per_prompt.values()),
math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio))
)
# Ensure min num mm items is at least 0
min_num_mm_items = max(
0,
0,
math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
)
# Raise error if min num mm items is greater than max num mm items
if min_num_mm_items > max_num_mm_items:
raise ValueError(f"Min num mm items is greater than max mm items: "
f"{min_num_mm_items} > {max_num_mm_items}")
logger.info(
"Sampling number of multimodal items from [%s, %s]",
min_num_mm_items, max_num_mm_items,
@@ -783,8 +771,8 @@ class RandomMultiModalDataset(RandomDataset):
whose size is between min_num_mm_items and max_num_mm_items.
Loop over the bucket config and sample a multimodal item.
Loop until the number of multimodal items sampled is equal to
request_num_mm_items or limit of multimodal items per prompt
Loop until the number of multimodal items sampled is equal to
request_num_mm_items or limit of multimodal items per prompt
for all modalities is reached.
Note:
@@ -796,19 +784,19 @@ class RandomMultiModalDataset(RandomDataset):
# Get the number of multimodal items to sample
request_num_mm_items = int(
self._rng.integers(min_num_mm_items, max_num_mm_items + 1)
)
)
# If request_num_mm_items is 0, yield an empty iterator
if request_num_mm_items == 0:
return
# Initialize modality counters
modality_counter = {self.map_config_to_modality(k): 0
modality_counter = {self.map_config_to_modality(k): 0
for k in bucket_config}
# Copy the bucket config to avoid modifying the original
bucket_config_copy = bucket_config.copy()
# Loop over the number of multimodal items to sample
while sum(modality_counter.values()) < request_num_mm_items:
# Sample a multimodal item config
mm_item_config = self._rng.choice(list(bucket_config_copy.keys()),
mm_item_config = self._rng.choice(list(bucket_config_copy.keys()),
p=list(bucket_config_copy.values()))
modality = self.map_config_to_modality(mm_item_config)
# Check that modality count is less than limit per prompt
@@ -849,7 +837,7 @@ class RandomMultiModalDataset(RandomDataset):
limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
bucket_config: dict[tuple[int, int, int], float] =
bucket_config: dict[tuple[int, int, int], float] =
DEFAULT_MM_ITEM_BUCKET_CONFIG,
enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
**kwargs,
@@ -857,7 +845,7 @@ class RandomMultiModalDataset(RandomDataset):
# NOTE: Video sampling is WIP. Raise error if video is in bucket config
# and probability is non-zero.
if any(self.map_config_to_modality(cfg) == "video" and p > 0
if any(self.map_config_to_modality(cfg) == "video" and p > 0
for cfg, p in bucket_config.items()):
raise NotImplementedError("Video sampling not implemented; "
"set its probability to 0.")
@@ -908,7 +896,7 @@ class RandomMultiModalDataset(RandomDataset):
])
if enable_multimodal_chat:
# NOTE: For now this option is only provided for completeness
# NOTE: For now this option is only provided for completeness
# given that the serve.py benchmark currently does not use it.
mm_chat_prompt: Any = prompt
mm_chat_prompt = self.apply_multimodal_chat_transformation(
@@ -982,8 +970,8 @@ class ShareGPTDataset(BenchmarkDataset):
entry["conversations"][1]["value"],
)
lora_request, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
lora_request = self.get_random_lora_request(
max_loras=max_loras, lora_path=lora_path)
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
@@ -994,11 +982,11 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len
is not None):
continue
if image_path := entry.get("image"):
mm_content = process_image(image_path)
elif video_path := entry.get("video"):
if image_path := entry.get("image"):
mm_content = process_image(image_path)
elif video_path := entry.get("video"):
mm_content = process_video(video_path)
else:
else:
mm_content = None
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
@@ -1013,9 +1001,9 @@ class ShareGPTDataset(BenchmarkDataset):
request_id=request_id_prefix + str(ind),
))
ind += 1
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
no_oversample)
return samples
@@ -1024,11 +1012,11 @@ class _ValidateDatasetArgs(argparse.Action):
"""Argparse action to validate dataset name and path compatibility."""
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
# Get current values of both dataset_name and dataset_path
dataset_name = getattr(namespace, 'dataset_name', 'random')
dataset_path = getattr(namespace, 'dataset_path', None)
# Validate the combination
if dataset_name == "random" and dataset_path is not None:
parser.error(
@@ -1053,7 +1041,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
default="random",
action=_ValidateDatasetArgs,
choices=[
"sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
"sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
"custom", "prefix_repetition", "spec_bench"
],
help="Name of the dataset to benchmark on.",
@@ -1502,7 +1490,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"spec_bench":
lambda: SpecBench(dataset_path=args.dataset_path,
lambda: SpecBench(dataset_path=args.dataset_path,
category=args.spec_bench_category).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
@@ -1660,7 +1648,7 @@ class CustomDataset(BenchmarkDataset):
logger.info("num_requests is set to 0 or negative, "
"so using all available samples: %d",
num_requests)
sampled_requests = []
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
@@ -1686,7 +1674,7 @@ class CustomDataset(BenchmarkDataset):
expected_output_len=output_len,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -1700,7 +1688,7 @@ class CustomDataset(BenchmarkDataset):
class SpecBench(CustomDataset):
"""
Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
Download the dataset using:
Download the dataset using:
wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
""" # noqa: E501
@@ -1736,8 +1724,8 @@ class SpecBench(CustomDataset):
# leverage CustomDataset sample
kwargs["skip_chat_template"] = False
return super().sample(**kwargs)
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
@@ -1882,8 +1870,8 @@ class BurstGPTDataset(BenchmarkDataset):
for i in range(num_requests):
input_len = int(data[i][2])
output_len = int(data[i][3])
lora_req, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
lora_req = self.get_random_lora_request(
max_loras=max_loras, lora_path=lora_path)
vocab_size = tokenizer.vocab_size
# Generate a synthetic prompt: a list of token IDs computed as (i +
# j) modulo vocab_size.
@@ -1995,7 +1983,7 @@ class ConversationDataset(HuggingFaceDataset):
request_id=request_id_prefix + str(ind),
))
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2055,7 +2043,7 @@ class VisionArenaDataset(HuggingFaceDataset):
multi_modal_data=mm_content,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2172,7 +2160,7 @@ class InstructCoderDataset(HuggingFaceDataset):
expected_output_len=output_len,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2234,7 +2222,7 @@ class MTBenchDataset(HuggingFaceDataset):
expected_output_len=output_len,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2288,8 +2276,8 @@ class BlazeditDataset(HuggingFaceDataset):
# compare the levenshtein distance normalized by code length
if norm_distance < min_distance or norm_distance > max_distance:
continue
# template copied from
# template copied from
# https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
instruction = f"""Given a code file, please apply the change requests and generate the new file.
@@ -2322,9 +2310,9 @@ Please generate the new code file in the "New file" section below.""" # noqa: E5
expected_output_len=output_len,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2376,7 +2364,6 @@ class AIMODataset(HuggingFaceDataset):
expected_output_len=output_len,
multi_modal_data=None,
request_id=request_id_prefix + str(ind),
))
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
@@ -2470,9 +2457,9 @@ class NextEditPredictionDataset(HuggingFaceDataset):
))
if len(samples) >= num_requests:
break
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
no_oversample)
return samples
@@ -2562,7 +2549,7 @@ class ASRDataset(HuggingFaceDataset):
" what Whisper supports.",
skipped,
)
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2647,7 +2634,7 @@ class MLPerfDataset(HuggingFaceDataset):
)
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
@@ -2658,7 +2645,7 @@ class MLPerfDataset(HuggingFaceDataset):
class PrefixRepetitionRandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the repeated prefix
# Default values copied from benchmark_serving.py for the repeated prefix
# dataset.
DEFAULT_PREFIX_LEN = 256
DEFAULT_SUFFIX_LEN = 256