diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md
index 2a731e9b7..41912a506 100644
--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -6,34 +6,38 @@
 !!! warning
     Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model.

-To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
+To run a GGUF model with vLLM, you can use the `repo_id:quant_type` format to load directly from HuggingFace. For example, to load a Q4_K_M quantized model from [unsloth/Qwen3-0.6B-GGUF](https://huggingface.co/unsloth/Qwen3-0.6B-GGUF):

 ```bash
-wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
-# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
 ```

 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:

 ```bash
-# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+    --tokenizer Qwen/Qwen3-0.6B \
     --tensor-parallel-size 2
 ```

+Alternatively, you can download and use a local GGUF file:
+
+```bash
+wget https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf
+vllm serve ./Qwen3-0.6B-Q4_K_M.gguf --tokenizer Qwen/Qwen3-0.6B
+```
+
 !!! warning
     We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.

-GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
+GGUF assumes that HuggingFace can convert the metadata to a config file. If HuggingFace doesn't support your model, you can manually create a config and pass it via `--hf-config-path`:

 ```bash
-# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
-    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
-    --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
+# If your model is not supported by HuggingFace, you can manually provide a HuggingFace-compatible config path
+vllm serve unsloth/Qwen3-0.6B-GGUF:Q4_K_M \
+    --tokenizer Qwen/Qwen3-0.6B \
+    --hf-config-path Qwen/Qwen3-0.6B
 ```

 You can also use the GGUF model directly through the LLM entrypoint:
@@ -66,10 +70,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-    # Create an LLM.
+    # Create an LLM using the repo_id:quant_type format.
     llm = LLM(
-        model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-        tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        model="unsloth/Qwen3-0.6B-GGUF:Q4_K_M",
+        tokenizer="Qwen/Qwen3-0.6B",
     )
     # Generate texts from the prompts. The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
diff --git a/examples/offline_inference/basic/README.md b/examples/offline_inference/basic/README.md
index cbb3116e9..3eedeb725 100644
--- a/examples/offline_inference/basic/README.md
+++ b/examples/offline_inference/basic/README.md
@@ -56,17 +56,10 @@ Try it yourself with the following argument:

 vLLM supports models that are quantized using GGUF.

-Try one yourself by downloading a quantized GGUF model and using the following arguments:
-
-```python
-from huggingface_hub import hf_hub_download
-repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
-filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
-print(hf_hub_download(repo_id, filename=filename))
-```
+Try one yourself using the `repo_id:quant_type` format to load directly from HuggingFace:

 ```bash
---model {local-path-printed-above} --tokenizer microsoft/Phi-3-medium-4k-instruct
+--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B
 ```

 ### CPU offload
diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py
index 0a6a65b41..cf83970b4 100644
--- a/tests/transformers_utils/test_utils.py
+++ b/tests/transformers_utils/test_utils.py
@@ -43,7 +43,7 @@ class TestIsRemoteGGUF:

     def test_is_remote_gguf_with_colon_and_slash(self):
         """Test is_remote_gguf with repo_id:quant_type format."""
-        # Valid quant types
+        # Valid quant types (exact GGML types)
         assert is_remote_gguf("unsloth/Qwen3-0.6B-GGUF:IQ1_S")
         assert is_remote_gguf("user/repo:Q2_K")
         assert is_remote_gguf("repo/model:Q4_K")
@@ -54,6 +54,24 @@ class TestIsRemoteGGUF:
         assert not is_remote_gguf("repo/model:INVALID")
         assert not is_remote_gguf("repo/model:invalid_type")

+    def test_is_remote_gguf_extended_quant_types(self):
+        """Test is_remote_gguf with extended quant type naming conventions."""
+        # Extended quant types with _M, _S, _L suffixes
+        assert is_remote_gguf("repo/model:Q4_K_M")
+        assert is_remote_gguf("repo/model:Q4_K_S")
+        assert is_remote_gguf("repo/model:Q3_K_L")
+        assert is_remote_gguf("repo/model:Q5_K_M")
+        assert is_remote_gguf("repo/model:Q3_K_S")
+
+        # Extended quant types with _XL, _XS, _XXS suffixes
+        assert is_remote_gguf("repo/model:Q5_K_XL")
+        assert is_remote_gguf("repo/model:IQ4_XS")
+        assert is_remote_gguf("repo/model:IQ3_XXS")
+
+        # Invalid extended types (base type doesn't exist)
+        assert not is_remote_gguf("repo/model:INVALID_M")
+        assert not is_remote_gguf("repo/model:Q9_K_M")
+
     def test_is_remote_gguf_without_colon(self):
         """Test is_remote_gguf without colon."""
         assert not is_remote_gguf("repo/model")
@@ -106,6 +124,16 @@ class TestSplitRemoteGGUF:
         assert repo_id == "repo/model"
         assert quant_type == "Q2_K"

+    def test_split_remote_gguf_extended_quant_types(self):
+        """Test split_remote_gguf with extended quant type naming conventions."""
+        repo_id, quant_type = split_remote_gguf("unsloth/Qwen3-0.6B-GGUF:Q4_K_M")
+        assert repo_id == "unsloth/Qwen3-0.6B-GGUF"
+        assert quant_type == "Q4_K_M"
+
+        repo_id, quant_type = split_remote_gguf("repo/model:Q3_K_S")
+        assert repo_id == "repo/model"
+        assert quant_type == "Q3_K_S"
+
     def test_split_remote_gguf_with_path_object(self):
         """Test split_remote_gguf with Path object."""
         repo_id, quant_type = split_remote_gguf(Path("unsloth/Qwen3-0.6B-GGUF:IQ1_S"))
@@ -147,6 +175,11 @@ class TestIsGGUF:
         assert is_gguf("repo/model:Q2_K")
         assert is_gguf("repo/model:Q4_K")

+        # Extended quant types with suffixes
+        assert is_gguf("repo/model:Q4_K_M")
+        assert is_gguf("repo/model:Q3_K_S")
+        assert is_gguf("repo/model:Q5_K_L")
+
         # Invalid quant_type should return False
         assert not is_gguf("repo/model:quant")
         assert not is_gguf("repo/model:INVALID")
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index f3fd43c6a..81d773316 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -49,9 +49,29 @@ def is_remote_gguf(model: str | Path) -> bool:
     return False


+# Common suffixes used in GGUF file naming conventions
+# e.g., Q4_K_M, Q3_K_S, Q5_K_L, Q2_K_XL
+_GGUF_QUANT_SUFFIXES = ("_M", "_S", "_L", "_XL", "_XS", "_XXS")
+
+
 def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool:
-    """Check if the quant type is a valid GGUF quant type."""
-    return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None
+    """Check if the quant type is a valid GGUF quant type.
+
+    Supports both exact GGML quant types (e.g., Q4_K, IQ1_S) and
+    extended naming conventions (e.g., Q4_K_M, Q3_K_S, Q5_K_L).
+    """
+    # Check for exact match first
+    if getattr(GGMLQuantizationType, gguf_quant_type, None) is not None:
+        return True
+
+    # Check for extended naming conventions (e.g., Q4_K_M -> Q4_K)
+    for suffix in _GGUF_QUANT_SUFFIXES:
+        if gguf_quant_type.endswith(suffix):
+            base_type = gguf_quant_type[: -len(suffix)]
+            if getattr(GGMLQuantizationType, base_type, None) is not None:
+                return True
+
+    return False


 def split_remote_gguf(model: str | Path) -> tuple[str, str]:
@@ -63,7 +83,8 @@ def split_remote_gguf(model: str | Path) -> tuple[str, str]:
         raise ValueError(
             f"Wrong GGUF model or invalid GGUF quant type: {model}.\n"
             "- It should be in repo_id:quant_type format.\n"
-            f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}",
+            f"- Valid base quant types: {GGMLQuantizationType._member_names_}\n"
+            f"- Extended suffixes also supported: {_GGUF_QUANT_SUFFIXES}",
         )
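
For reference, here is a minimal standalone sketch of the suffix-stripping idea that the `is_valid_gguf_quant_type` change above implements. The helper name `resolve_base_quant_type` is hypothetical and used only for illustration; the only assumption is the `gguf` package's `GGMLQuantizationType` enum, which the patch itself relies on:

```python
# Standalone sketch of the extended-suffix matching added in gguf_utils.py.
# `resolve_base_quant_type` is a hypothetical name, not part of the patch.
from gguf import GGMLQuantizationType

_SUFFIXES = ("_M", "_S", "_L", "_XL", "_XS", "_XXS")


def resolve_base_quant_type(name: str) -> str | None:
    """Return the base GGML quant type for an exact or extended name, else None."""
    # An exact enum member (e.g. Q4_K, IQ1_S) is accepted as-is.
    if getattr(GGMLQuantizationType, name, None) is not None:
        return name
    # Otherwise strip a known suffix and check whether the remaining base type exists.
    for suffix in _SUFFIXES:
        if name.endswith(suffix):
            base = name[: -len(suffix)]
            if getattr(GGMLQuantizationType, base, None) is not None:
                return base
    return None


print(resolve_base_quant_type("Q4_K_M"))  # "Q4_K"  (extended name)
print(resolve_base_quant_type("IQ1_S"))   # "IQ1_S" (exact enum member)
print(resolve_base_quant_type("Q9_K_M"))  # None    (no Q9_K base type)
```

This is why `repo/model:Q9_K_M` is rejected in the tests: the suffix is recognized, but the remaining `Q9_K` is not a GGML quantization type.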
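An extended name such as `Q4_K_M` still has to correspond to a file the repository actually ships. Not part of this patch, but one way to check which quantizations a repo offers is to list its `.gguf` files with the standard `huggingface_hub` API; the repo below is the one used in the docs:

```python
# List the GGUF files in a repo to see which quant_type values are available.
from huggingface_hub import list_repo_files

files = list_repo_files("unsloth/Qwen3-0.6B-GGUF")
for name in sorted(f for f in files if f.endswith(".gguf")):
    print(name)  # e.g. a file named Qwen3-0.6B-Q4_K_M.gguf corresponds to quant_type Q4_K_M
```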