support ignore patterns in model loader (#6673)

2024-07-22 23:59:42 -07:00
parent 22fa2e35cb
commit 3eda4ec780
4 changed files with 51 additions and 10 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -599,12 +599,16 @@ class LoadConfig:
                mainly for profiling.
            "tensorizer" will use CoreWeave's tensorizer library for
                fast weight loading.
+        ignore_patterns: The list of patterns to ignore when loading the model.
+            Defaults to "original/**/*" to avoid repeated loading of llama's
+            checkpoints.
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(
        default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None

    def __post_init__(self):
        model_loader_extra_config = self.model_loader_extra_config or {}
@@ -613,6 +617,13 @@ class LoadConfig:
                model_loader_extra_config)
        self._verify_load_format()

+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns)
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
    def _verify_load_format(self) -> None:
        if not isinstance(self.load_format, str):
            return
@@ -801,7 +812,9 @@ class SchedulerConfig:
                # for higher throughput.
                self.max_num_batched_tokens = max(max_model_len, 2048)
        if enable_chunked_prefill:
-            logger.info("Chunked prefill is enabled (EXPERIMENTAL).")
+            logger.info(
+                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
+                max_num_batched_tokens)

        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len