[doc] improve readability (#18675)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -42,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"')
|
||||
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
|
||||
|
||||
```console
|
||||
python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
|
||||
python examples/offline_inference/llm_engine_example.py \
|
||||
--model TheBloke/Llama-2-7b-Chat-AWQ \
|
||||
--quantization awq
|
||||
```
|
||||
|
||||
AWQ models are also supported directly through the LLM entrypoint:
|
||||
|
||||
@@ -33,7 +33,12 @@ import torch
|
||||
|
||||
# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
|
||||
model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas"
|
||||
)
|
||||
```
|
||||
|
||||
## Read gptq format checkpoint
|
||||
@@ -44,5 +49,11 @@ import torch
|
||||
|
||||
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
|
||||
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
|
||||
llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.float16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas",
|
||||
max_model_len=1024
|
||||
)
|
||||
```
|
||||
|
||||
@@ -27,7 +27,11 @@ from vllm import LLM
|
||||
import torch
|
||||
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
||||
model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True
|
||||
)
|
||||
```
|
||||
|
||||
## Inflight quantization: load as 4bit quantization
|
||||
@@ -38,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify
|
||||
from vllm import LLM
|
||||
import torch
|
||||
model_id = "huggyllama/llama-7b"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes")
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitsandbytes"
|
||||
)
|
||||
```
|
||||
|
||||
## OpenAI Compatible Server
|
||||
|
||||
@@ -14,14 +14,17 @@ To run a GGUF model with vLLM, you can download and use the local GGUF model fro
|
||||
```console
|
||||
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
```
|
||||
|
||||
You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
|
||||
|
||||
```console
|
||||
# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
--tensor-parallel-size 2
|
||||
```
|
||||
|
||||
!!! warning
|
||||
@@ -31,7 +34,9 @@ GGUF assumes that huggingface can convert the metadata to a config file. In case
|
||||
|
||||
```console
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
|
||||
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
|
||||
--hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
|
||||
```
|
||||
|
||||
You can also use the GGUF model directly through the LLM entrypoint:
|
||||
|
||||
@@ -59,7 +59,8 @@ model.save(quant_path)
|
||||
To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
|
||||
|
||||
```console
|
||||
python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
|
||||
python examples/offline_inference/llm_engine_example.py \
|
||||
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
|
||||
```
|
||||
|
||||
## Using GPTQModel with vLLM's Python API
|
||||
|
||||
@@ -7,7 +7,9 @@ We recommend installing the latest torchao nightly with
|
||||
```console
|
||||
# Install the latest TorchAO nightly build
|
||||
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
|
||||
pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
pip install \
|
||||
--pre torchao>=10.0.0 \
|
||||
--index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
```
|
||||
|
||||
## Quantizing HuggingFace Models
|
||||
@@ -20,7 +22,12 @@ from torchao.quantization import Int8WeightOnlyConfig
|
||||
|
||||
model_name = "meta-llama/Meta-Llama-3-8B"
|
||||
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
Reference in New Issue
Block a user