[UX] Use gguf repo_id:quant_type syntax for examples and docs (#33371)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -6,34 +6,38 @@
|
||||
!!! warning
|
||||
Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge it into a single-file model.
|
||||
|
||||
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
|
||||
To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also add `--tensor-parallel-size 2` to enable tensor-parallel inference across 2 GPUs:
|
||||
|
||||
```bash
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--tensor-parallel-size 2
|
||||
```
|
||||
|
||||
Alternatively, you can download and use a local GGUF file:
|
||||
|
||||
```bash
|
||||
wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
|
||||
vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
!!! warning
|
||||
We recommend using the tokenizer from the base model instead of the one embedded in the GGUF model, because tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary.
|
||||
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
|
||||
GGUF assumes that HuggingFace can convert the metadata to a config file. If HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:
|
||||
|
||||
```bash
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
--hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
# If your model is not supported by HuggingFace, you can manually provide a HuggingFace-compatible config path
|
||||
vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
|
||||
--tokenizer Qwen/Qwen3-0.6B \
|
||||
--hf-config-path Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
You can also use the GGUF model directly through the LLM entrypoint:
|
||||
@@ -66,10 +70,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
# Create an LLM using repo_id:quant_type format.
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
|
||||
tokenizer="Qwen/Qwen3-0.6B",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
|
||||
@@ -56,17 +56,10 @@ Try it yourself with the following argument:
|
||||
|
||||
vLLM supports models that are quantized using GGUF.
|
||||
|
||||
Try one yourself by downloading a quantized GGUF model and using the following arguments:
|
||||
|
||||
```python
|
||||
from huggingface_hub import hf_hub_download
|
||||
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
|
||||
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
|
||||
print(hf_hub_download(repo_id, filename=filename))
|
||||
```
|
||||
Try one yourself using the `repo_id:quant_type` format to load directly from HuggingFace:
|
||||
|
||||
```bash
|
||||
--model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
|
||||
--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
|
||||
```
|
||||
|
||||
### CPU offload
|
||||
|
||||
@@ -43,7 +43,7 @@ class TestIsRemoteGGUF:
|
||||
|
||||
def test_is_remote_gguf_with_colon_and_slash(self):
|
||||
"""Test is_remote_gguf with repo_id:quant_type format."""
|
||||
# Valid quant types
|
||||
# Valid quant types (exact GGML types)
|
||||
assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
|
||||
assert is_remote_gguf("user/repo:Q2_K")
|
||||
assert is_remote_gguf("repo/model:Q4_K")
|
||||
@@ -54,6 +54,24 @@ class TestIsRemoteGGUF:
|
||||
assert not is_remote_gguf("repo/model:INVALID")
|
||||
assert not is_remote_gguf("repo/model:invalid_type")
|
||||
|
||||
def test_is_remote_gguf_extended_quant_types(self):
    """Verify is_remote_gguf on quant type names that carry a naming suffix.

    Names built from a known base type plus a suffix must be accepted;
    names whose base type does not exist must be rejected.
    """
    accepted_models = (
        # Suffixes _M, _S, _L
        "repo/model:Q4_K_M",
        "repo/model:Q4_K_S",
        "repo/model:Q3_K_L",
        "repo/model:Q5_K_M",
        "repo/model:Q3_K_S",
        # Suffixes _XL, _XS, _XXS
        "repo/model:Q5_K_XL",
        "repo/model:IQ4_XS",
        "repo/model:IQ3_XXS",
    )
    for model in accepted_models:
        assert is_remote_gguf(model)

    # A valid suffix is not enough: the base type must exist too.
    rejected_models = (
        "repo/model:INVALID_M",
        "repo/model:Q9_K_M",
    )
    for model in rejected_models:
        assert not is_remote_gguf(model)
|
||||
|
||||
def test_is_remote_gguf_without_colon(self):
|
||||
"""Test is_remote_gguf without colon."""
|
||||
assert not is_remote_gguf("repo/model")
|
||||
@@ -106,6 +124,16 @@ class TestSplitRemoteGGUF:
|
||||
assert repo_id == "repo/model"
|
||||
assert quant_type == "Q2_K"
|
||||
|
||||
def test_split_remote_gguf_extended_quant_types(self):
    """Verify split_remote_gguf on suffixed quant type names.

    Each ``repo_id:quant_type`` string must split into the repo id and the
    full quant type, suffix included.
    """
    expected_splits = {
        "unsloth/Qwen3-0.6B-GGUF:Q4_K_M": ("unsloth/Qwen3-0.6B-GGUF", "Q4_K_M"),
        "repo/model:Q3_K_S": ("repo/model", "Q3_K_S"),
    }
    for model, (expected_repo_id, expected_quant_type) in expected_splits.items():
        repo_id, quant_type = split_remote_gguf(model)
        assert repo_id == expected_repo_id
        assert quant_type == expected_quant_type
|
||||
|
||||
def test_split_remote_gguf_with_path_object(self):
|
||||
"""Test split_remote_gguf with Path object."""
|
||||
repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
|
||||
@@ -147,6 +175,11 @@ class TestIsGGUF:
|
||||
assert is_gguf("repo/model:Q2_K")
|
||||
assert is_gguf("repo/model:Q4_K")
|
||||
|
||||
# Extended quant types with suffixes
|
||||
assert is_gguf("repo/model:Q4_K_M")
|
||||
assert is_gguf("repo/model:Q3_K_S")
|
||||
assert is_gguf("repo/model:Q5_K_L")
|
||||
|
||||
# Invalid quant_type should return False
|
||||
assert not is_gguf("repo/model:quant")
|
||||
assert not is_gguf("repo/model:INVALID")
|
||||
|
||||
@@ -49,9 +49,29 @@ def is_remote_gguf(model: str | Path) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
# Common suffixes used in GGUF file naming conventions
|
||||
# e.g., Q4_K_M, Q3_K_S, Q5_K_L, Q2_K_XL
|
||||
_GGUF_QUANT_SUFFIXES = ("_M", "_S", "_L", "_XL", "_XS", "_XXS")
|
||||
|
||||
|
||||
def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
    """Check if the quant type is a valid GGUF quant type.

    Supports both exact GGML quant types (e.g., Q4_K, IQ1_S) and
    extended naming conventions (e.g., Q4_K_M, Q3_K_S, Q5_K_L), where one
    suffix from ``_GGUF_QUANT_SUFFIXES`` follows a base GGML quant type.

    Args:
        gguf_quant_type: Candidate quant type name, e.g. the part after the
            colon in a ``repo_id:quant_type`` model string.

    Returns:
        True if the name is a ``GGMLQuantizationType`` member, or such a
        member followed by one recognised suffix; False otherwise.
    """
    # Test membership instead of using ``getattr``: ``getattr`` also resolves
    # ordinary attributes of the enum class (``__doc__``, ``__members__``,
    # ``mro``, ...), so those strings would wrongly pass as quant types.
    quant_types = GGMLQuantizationType.__members__

    # Check for exact match first
    if gguf_quant_type in quant_types:
        return True

    # Check for extended naming conventions (e.g., Q4_K_M -> Q4_K)
    return any(
        gguf_quant_type.endswith(suffix)
        and gguf_quant_type[: -len(suffix)] in quant_types
        for suffix in _GGUF_QUANT_SUFFIXES
    )
|
||||
|
||||
|
||||
def split_remote_gguf(model: str | Path) -> tuple[str, str]:
|
||||
@@ -63,7 +83,8 @@ def split_remote_gguf(model: str | Path) -> tuple[str, str]:
|
||||
raise ValueError(
|
||||
f"Wrong GGUF model or invalid GGUF quant type: {model}.\n"
|
||||
"- It should be in repo_id:quant_type format.\n"
|
||||
f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}",
|
||||
f"- Valid base quant types: {GGMLQuantizationType._member_names_}\n"
|
||||
f"- Extended suffixes also supported: {_GGUF_QUANT_SUFFIXES}",
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user