[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
|
||||
It's best to use calibration data that closely matches your deployment data.
|
||||
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
??? Code
|
||||
|
||||
NUM_CALIBRATION_SAMPLES = 512
|
||||
MAX_SEQUENCE_LENGTH = 2048
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
# Load and preprocess the dataset
|
||||
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
||||
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
||||
NUM_CALIBRATION_SAMPLES = 512
|
||||
MAX_SEQUENCE_LENGTH = 2048
|
||||
|
||||
def preprocess(example):
|
||||
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
||||
ds = ds.map(preprocess)
|
||||
# Load and preprocess the dataset
|
||||
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
|
||||
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
|
||||
|
||||
def tokenize(sample):
|
||||
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
||||
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
||||
```
|
||||
def preprocess(example):
|
||||
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
|
||||
ds = ds.map(preprocess)
|
||||
|
||||
def tokenize(sample):
|
||||
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
|
||||
ds = ds.map(tokenize, remove_columns=ds.column_names)
|
||||
```
|
||||
|
||||
### 3. Applying Quantization
|
||||
|
||||
Now, apply the quantization algorithms:
|
||||
|
||||
```python
|
||||
from llmcompressor.transformers import oneshot
|
||||
from llmcompressor.modifiers.quantization import GPTQModifier
|
||||
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
||||
??? Code
|
||||
|
||||
# Configure the quantization algorithms
|
||||
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
|
||||
```python
|
||||
from llmcompressor.transformers import oneshot
|
||||
from llmcompressor.modifiers.quantization import GPTQModifier
|
||||
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
|
||||
|
||||
# Apply quantization
|
||||
oneshot(
|
||||
model=model,
|
||||
dataset=ds,
|
||||
recipe=recipe,
|
||||
max_seq_length=MAX_SEQUENCE_LENGTH,
|
||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||
)
|
||||
# Configure the quantization algorithms
|
||||
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
|
||||
|
||||
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
|
||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||
tokenizer.save_pretrained(SAVE_DIR)
|
||||
```
|
||||
# Apply quantization
|
||||
oneshot(
|
||||
model=model,
|
||||
dataset=ds,
|
||||
recipe=recipe,
|
||||
max_seq_length=MAX_SEQUENCE_LENGTH,
|
||||
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
|
||||
)
|
||||
|
||||
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
|
||||
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
|
||||
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||||
tokenizer.save_pretrained(SAVE_DIR)
|
||||
```
|
||||
|
||||
This process creates a W4A16 model with weights quantized to 4-bit integers.
|
||||
|
||||
@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
|
||||
|
||||
The following is an example of an expanded quantization recipe you can tune to your own use case:
|
||||
|
||||
```python
|
||||
from compressed_tensors.quantization import (
|
||||
QuantizationArgs,
|
||||
QuantizationScheme,
|
||||
QuantizationStrategy,
|
||||
QuantizationType,
|
||||
)
|
||||
recipe = GPTQModifier(
|
||||
targets="Linear",
|
||||
config_groups={
|
||||
"config_group": QuantizationScheme(
|
||||
targets=["Linear"],
|
||||
weights=QuantizationArgs(
|
||||
num_bits=4,
|
||||
type=QuantizationType.INT,
|
||||
strategy=QuantizationStrategy.GROUP,
|
||||
group_size=128,
|
||||
symmetric=True,
|
||||
dynamic=False,
|
||||
actorder="weight",
|
||||
??? Code
|
||||
|
||||
```python
|
||||
from compressed_tensors.quantization import (
|
||||
QuantizationArgs,
|
||||
QuantizationScheme,
|
||||
QuantizationStrategy,
|
||||
QuantizationType,
|
||||
)
|
||||
recipe = GPTQModifier(
|
||||
targets="Linear",
|
||||
config_groups={
|
||||
"config_group": QuantizationScheme(
|
||||
targets=["Linear"],
|
||||
weights=QuantizationArgs(
|
||||
num_bits=4,
|
||||
type=QuantizationType.INT,
|
||||
strategy=QuantizationStrategy.GROUP,
|
||||
group_size=128,
|
||||
symmetric=True,
|
||||
dynamic=False,
|
||||
actorder="weight",
|
||||
),
|
||||
),
|
||||
),
|
||||
},
|
||||
ignore=["lm_head"],
|
||||
update_size=NUM_CALIBRATION_SAMPLES,
|
||||
dampening_frac=0.01
|
||||
)
|
||||
```
|
||||
},
|
||||
ignore=["lm_head"],
|
||||
update_size=NUM_CALIBRATION_SAMPLES,
|
||||
dampening_frac=0.01
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting and Support
|
||||
|
||||
|
||||
Reference in New Issue
Block a user