[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
@@ -15,29 +15,31 @@ pip install autoawq

After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:

??? Code

    ```python
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
    quant_path = 'mistral-instruct-v0.2-awq'
    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(
        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    print(f'Model is quantized and saved at "{quant_path}"')
    ```

To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:

@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \

AWQ models are also supported directly through the LLM entrypoint:

??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

@@ -43,17 +43,19 @@ llm = LLM(

## Read gptq format checkpoint

??? Code

    ```python
    from vllm import LLM
    import torch

    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
    llm = LLM(
        model=model_id,
        dtype=torch.float16,
        trust_remote_code=True,
        quantization="bitblas",
        max_model_len=1024
    )
    ```

@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r

Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.

??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # Configure the simple PTQ quantization
    recipe = QuantizationModifier(
        targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

    # Apply the quantization algorithm.
    oneshot(model=model, recipe=recipe)

    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    ```

### 3. Evaluating Accuracy

@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \

You can also use the GGUF model directly through the LLM entrypoint:

??? Code

    ```python
    from vllm import LLM, SamplingParams

    # In this script, we demonstrate how to pass input to the chat method:
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": "Hello"
        },
        {
            "role": "assistant",
            "content": "Hello! How can I assist you today?"
        },
        {
            "role": "user",
            "content": "Write an essay about the importance of higher education.",
        },
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.chat(conversation, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    ```

@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t

Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

??? Code

    ```python
    from datasets import load_dataset
    from gptqmodel import GPTQModel, QuantizeConfig

    model_id = "meta-llama/Llama-3.2-1B-Instruct"
    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

    calibration_dataset = load_dataset(
        "allenai/c4",
        data_files="en/c4-train.00001-of-01024.json.gz",
        split="train"
    ).select(range(1024))["text"]

    quant_config = QuantizeConfig(bits=4, group_size=128)

    model = GPTQModel.load(model_id, quant_config)

    # increase `batch_size` to match gpu/vram specs to speed up quantization
    model.quantize(calibration_dataset, batch_size=2)

    model.save(quant_path)
    ```

## Running a quantized model with vLLM

@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \

GPTQModel quantized models are also supported directly through the LLM entrypoint:

??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

    # Create an LLM.
    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    print("-"*50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-"*50)
    ```

@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```

### 3. Applying Quantization

Now, apply the quantization algorithms:

??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```

This process creates a W4A16 model with weights quantized to 4-bit integers.
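
As a quick sanity check (not part of this diff), a checkpoint saved this way can usually be loaded straight back into vLLM; the sketch below assumes the illustrative `SAVE_DIR` produced above and that the compressed-tensors format is detected from the checkpoint config:

```python
from vllm import LLM, SamplingParams

# Directory name is the illustrative SAVE_DIR from the snippet above.
llm = LLM(model="Meta-Llama-3-8B-Instruct-W4A16-G128")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)
```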
@@ -137,34 +141,36 @@ $ lm_eval --model vllm \

The following is an example of an expanded quantization recipe you can tune to your own use case:

??? Code

    ```python
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationScheme,
        QuantizationStrategy,
        QuantizationType,
    )
    recipe = GPTQModifier(
        targets="Linear",
        config_groups={
            "config_group": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(
                    num_bits=4,
                    type=QuantizationType.INT,
                    strategy=QuantizationStrategy.GROUP,
                    group_size=128,
                    symmetric=True,
                    dynamic=False,
                    actorder="weight",
                ),
            ),
        },
        ignore=["lm_head"],
        update_size=NUM_CALIBRATION_SAMPLES,
        dampening_frac=0.01
    )
    ```

## Troubleshooting and Support

@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

??? Code

    ```python
    from datasets import load_dataset

    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess the dataset
    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    ```

</details>

### 3. Applying Quantization

Now, apply the quantization algorithms:

??? Code

    ```python
    from llmcompressor.transformers import oneshot
    from llmcompressor.modifiers.quantization import GPTQModifier
    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

    # Configure the quantization algorithms
    recipe = [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ]

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```

This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
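
The saved directory can then be passed to vLLM like any other compressed-tensors checkpoint; a minimal sketch, assuming the illustrative `SAVE_DIR` name used above:

```python
from vllm import LLM

# Illustrative SAVE_DIR from the snippet above.
llm = LLM(model="Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
print(llm.generate("The capital of France is")[0].outputs[0].text)
```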
@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te

Below is an example showing how to quantize a model using modelopt's PTQ API:

??? Code

    ```python
    import modelopt.torch.quantization as mtq
    from transformers import AutoModelForCausalLM

    # Load the model from HuggingFace
    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")

    # Select the quantization config, for example, FP8
    config = mtq.FP8_DEFAULT_CFG

    # Define a forward loop function for calibration
    def forward_loop(model):
        for data in calib_set:
            model(data)

    # PTQ with in-place replacement of quantized modules
    model = mtq.quantize(model, config, forward_loop)
    ```

After the model is quantized, you can export it to a quantized checkpoint using the export API:
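
The export call itself is outside this diff; as a rough sketch only, assuming modelopt's `export_hf_checkpoint` helper (names and signatures may differ between modelopt releases):

```python
import torch
from modelopt.torch.export import export_hf_checkpoint  # assumed helper

# Assumption: writes a HuggingFace-style quantized checkpoint to the given directory.
with torch.inference_mode():
    export_hf_checkpoint(model, export_dir="<quantized_checkpoint_dir>")
```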
@@ -48,31 +50,33 @@ with torch.inference_mode():

The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:

??? Code

    ```python
    from vllm import LLM, SamplingParams

    def main():

        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

        prompts = [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ]

        outputs = llm.generate(prompts, sampling_params)

        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    if __name__ == "__main__":
        main()
    ```

@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades

Here is an example of how to enable FP8 quantization:

??? Code

    ```python
    # To calculate kv cache scales on the fly enable the calculate_kv_scales
    # parameter

    from vllm import LLM, SamplingParams

    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
              kv_cache_dtype="fp8",
              calculate_kv_scales=True)
    prompt = "London is the capital of"
    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
    print(out)
    ```

The `kv_cache_dtype` argument specifies the data type for KV cache storage:

- `"auto"`: Uses the model's default "unquantized" data type

@@ -71,67 +73,69 @@ pip install llmcompressor

Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):

??? Code

    ```python
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llmcompressor.transformers import oneshot

    # Select model and load it
    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Select calibration dataset
    DATASET_ID = "HuggingFaceH4/ultrachat_200k"
    DATASET_SPLIT = "train_sft"

    # Configure calibration parameters
    NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
    MAX_SEQUENCE_LENGTH = 2048

    # Load and preprocess dataset
    ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

    def process_and_tokenize(example):
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
        return tokenizer(
            text,
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

    # Configure quantization settings
    recipe = """
    quant_stage:
        quant_modifiers:
            QuantizationModifier:
                kv_cache_scheme:
                    num_bits: 8
                    type: float
                    strategy: tensor
                    dynamic: false
                    symmetric: true
    """

    # Apply quantization
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    ```

The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
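
To sanity-check the result (illustrative, not part of this diff), the folder can be loaded with the FP8 KV cache enabled so the calibrated scales are used:

```python
from vllm import LLM, SamplingParams

# Assumes the SAVE_DIR written by the script above.
llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
out = llm.generate("London is the capital of",
                   SamplingParams(temperature=0.7, max_tokens=32))[0].outputs[0].text
print(out)
```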
@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) to fetch model and tokenizer.

??? Code

    ```python
    from transformers import AutoTokenizer, AutoModelForCausalLM

    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
    MAX_SEQ_LEN = 512

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", torch_dtype="auto",
    )
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
    tokenizer.pad_token = tokenizer.eos_token
    ```

### 2. Prepare the Calibration Dataloader

@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).

??? Code

    ```python
    from datasets import load_dataset
    from torch.utils.data import DataLoader

    BATCH_SIZE = 1
    NUM_CALIBRATION_DATA = 512

    # Load the dataset and get calibration data.
    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
    text_data = dataset["text"][:NUM_CALIBRATION_DATA]

    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
                                  padding=True, truncation=True, max_length=MAX_SEQ_LEN)
    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
                                  batch_size=BATCH_SIZE, drop_last=True)
    ```

### 3. Set the Quantization Configuration

@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.

??? Code

    ```python
    from quark.torch.quantization import (Config, QuantizationConfig,
                                          FP8E4M3PerTensorSpec,
                                          load_quant_algo_config_from_file)

    # Define fp8/per-tensor/static spec.
    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
                                               is_dynamic=False).to_quantization_spec()

    # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
                                             weight=FP8_PER_TENSOR_SPEC)

    # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
    KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
    kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
    kv_cache_quant_config = {name :
                             QuantizationConfig(input_tensors=global_quant_config.input_tensors,
                                                weight=global_quant_config.weight,
                                                output_tensors=KV_CACHE_SPEC)
                             for name in kv_cache_layer_names_for_llama}
    layer_quant_config = kv_cache_quant_config.copy()

    # Define algorithm config by config file.
    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
    algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

    EXCLUDE_LAYERS = ["lm_head"]
    quant_config = Config(
        global_quant_config=global_quant_config,
        layer_quant_config=layer_quant_config,
        kv_cache_quant_config=kv_cache_quant_config,
        exclude=EXCLUDE_LAYERS,
        algo_config=algo_config)
    ```

### 4. Quantize the Model and Export

@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details.

??? Code

    ```python
    import torch
    from quark.torch import ModelQuantizer, ModelExporter
    from quark.torch.export import ExporterConfig, JsonExporterConfig

    # Apply quantization.
    quantizer = ModelQuantizer(quant_config)
    quant_model = quantizer.quantize_model(model, calib_dataloader)

    # Freeze quantized model to export.
    freezed_model = quantizer.freeze(model)

    # Define export config.
    LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
    export_config = ExporterConfig(json_export_config=JsonExporterConfig())
    export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP

    # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
    EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
    exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
    with torch.no_grad():
        exporter.export_safetensors_model(freezed_model,
                                          quant_config=quant_config, tokenizer=tokenizer)
    ```

### 5. Evaluation in vLLM

Now, you can load and run the Quark quantized model directly through the LLM entrypoint:

??? Code

    ```python
    from vllm import LLM, SamplingParams

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
              kv_cache_dtype='fp8', quantization='quark')
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)
    ```

Or, you can use `lm_eval` to evaluate accuracy:

@@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models

You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:

??? Code

    ```Python
    import torch
    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
    from torchao.quantization import Int8WeightOnlyConfig

    model_name = "meta-llama/Meta-Llama-3-8B"
    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
    quantized_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
        quantization_config=quantization_config
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_text = "What are we having for dinner?"
    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

    hub_repo = # YOUR HUB REPO ID
    tokenizer.push_to_hub(hub_repo)
    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
    ```
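
Once the checkpoint is on the Hub it can be served with vLLM; a minimal sketch, assuming a hypothetical repo id and that the torchao quantization config stored in the checkpoint is honored (depending on your vLLM version you may need to pass `quantization="torchao"` explicitly):

```python
from vllm import LLM, SamplingParams

# "your-username/llama3-8b-int8wo" is an illustrative repo id, not a real checkpoint.
llm = LLM(model="your-username/llama3-8b-int8wo", quantization="torchao")
print(llm.generate(["What are we having for dinner?"],
                   SamplingParams(max_tokens=32))[0].outputs[0].text)
```
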
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.