Files
deepseek-v4-quant/quantize_llmcompressor.py

218 lines
8.9 KiB
Python
Raw Permalink Normal View History

2026-05-06 23:47:07 +00:00
#!/usr/bin/env python3
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.

Uses the sequential pipeline + activation calibration to produce W4A4 NVFP4
with calibrated activation global scales. Higher quality than the streaming
converter on activation-sensitive ops, at the cost of much longer wall time and
more fragility on a brand-new architecture.

Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
  - FP8 base resident in CPU RAM: ~865 GB
  - One transformer block on GPU at a time: ~10-30 GB HBM
  - Activation calibration cache: tens to a few hundred GB
  - Headroom: ~1.5+ TB RAM, ~1.4+ TB HBM

Critical: this loads the model with trust_remote_code=True. The V4 architecture
is brand new; expect to need:
  - transformers from source (or recent main)
  - llm-compressor from source
  - The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable

Usage:
    python quantize_llmcompressor.py \\
        --src DeepSeek-V4-Pro-FP8 \\
        --dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
        --num-samples 256 \\
        --max-seq-len 4096
"""
import argparse
import os
import sys
from pathlib import Path
import torch
def _parse_args(argv=None):
    """Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.

    Exits with argparse's usage error (status 2) when ``--num-samples`` or
    ``--max-seq-len`` is not a positive integer.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="Source FP8 model directory")
    ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
    ap.add_argument("--num-samples", type=int, default=256)
    ap.add_argument("--max-seq-len", type=int, default=4096)
    ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
    ap.add_argument(
        "--calibration-split", default="train_sft",
        help="Dataset split used for calibration (default matches ultrachat_200k)",
    )
    ap.add_argument(
        "--offload-folder", default="/root/nvidia-meeting/.offload",
        help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
    )
    ap.add_argument(
        "--no-activation-quant", action="store_true",
        help="Quantize weights only (no activation calibration). Faster, closer to Path A."
    )
    args = ap.parse_args(argv)
    # range(n) with n <= 0 would silently select nothing; reject it up front.
    if args.num_samples < 1:
        ap.error("--num-samples must be a positive integer")
    if args.max_seq_len < 1:
        ap.error("--max-seq-len must be a positive integer")
    return args


def _build_calibration_dataset(args, tokenizer, load_dataset):
    """Load, sample, render and tokenize the calibration dataset.

    Args:
        args: Parsed CLI namespace; reads ``calibration_dataset``,
            ``calibration_split``, ``num_samples`` and ``max_seq_len``.
        tokenizer: Tokenizer used for chat templating and tokenization.
        load_dataset: The ``datasets.load_dataset`` callable (passed in so the
            heavy import stays in ``main``).

    Returns:
        The tokenized dataset with the intermediate ``text`` column removed.

    Exits via ``sys.exit`` when no sample with non-empty text remains.
    """
    print(f"Loading calibration dataset {args.calibration_dataset} ...")
    ds = load_dataset(args.calibration_dataset, split=args.calibration_split)

    # select(range(n)) raises IndexError if n exceeds the split size, so clamp.
    available = len(ds)
    if available < args.num_samples:
        print(f"WARN: split '{args.calibration_split}' has only {available} rows; "
              f"using {available} instead of the requested {args.num_samples}.")
    ds = ds.shuffle(seed=42).select(range(min(args.num_samples, available)))

    template_warned = False

    def preprocess(example):
        # Use the model's chat template if it has one; ultrachat samples have a
        # 'messages' field already in the OpenAI shape.
        nonlocal template_warned
        if "messages" in example:
            try:
                text = tokenizer.apply_chat_template(
                    example["messages"], tokenize=False, add_generation_prompt=False
                )
            except Exception as e:
                # Best-effort fallback, but say so once instead of hiding it.
                if not template_warned:
                    print(f"WARN: apply_chat_template failed ({e}); "
                          "falling back to newline-joined message contents.")
                    template_warned = True
                # `or ""` also covers messages whose content is None.
                text = "\n".join(m.get("content") or "" for m in example["messages"])
        else:
            text = example.get("text") or example.get("prompt") or ""
        return {"text": text}

    ds = ds.map(preprocess, remove_columns=ds.column_names)

    # Drop samples that rendered to nothing rather than passing them on.
    before = len(ds)
    ds = ds.filter(lambda example: bool(example["text"].strip()))
    if len(ds) == 0:
        sys.exit(f"No usable calibration text found in {args.calibration_dataset} "
                 f"(split '{args.calibration_split}')")
    if len(ds) < before:
        print(f"WARN: dropped {before - len(ds)} calibration samples with empty text.")

    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=args.max_seq_len,
            padding=False,
            return_tensors=None,
        )

    return ds.map(tokenize, remove_columns=["text"])


def _build_recipe(modifier_cls, weights_only):
    """Create the quantization recipe.

    Args:
        modifier_cls: The ``QuantizationModifier`` class (passed in so the
            heavy import stays in ``main``).
        weights_only: When true use the ``NVFP4A16`` scheme, otherwise ``NVFP4``.

    Returns:
        A ``modifier_cls`` instance targeting ``Linear`` modules.
    """
    # The ignore list mirrors Path A's preserve list: output head, embeddings,
    # MoE router gates (NOT gate_proj!), norms, and V4-specific attention
    # indexer / mHC residual mixing weights.
    ignore = [
        "re:.*lm_head",
        "re:.*embed_tokens$",
        "re:.*\\.mlp\\.gate$",
        "re:.*\\.mlp\\.gate\\.weight$",
        "re:.*norm.*",
        "re:.*indexer.*",
        "re:.*hyper_conn.*",
        "re:.*\\.mhc.*",
        "re:.*scoring.*",
    ]
    if weights_only:
        print("Recipe: NVFP4 weight-only (W4A16 effective)")
        scheme = "NVFP4A16"  # weight-only variant
    else:
        print("Recipe: NVFP4 W4A4 with activation calibration")
        scheme = "NVFP4"
    return modifier_cls(targets="Linear", scheme=scheme, ignore=ignore)


def _replace_moe_modules(model):
    """Best-effort swap of MoE modules for calibration-friendly wrappers.

    Failures are reported as warnings and never abort the run.
    """
    # On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
    # every expert during calibration (otherwise routed-only experts never see
    # data). For DeepSeek V4 the MoE class name is something like
    # `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
    try:
        from llmcompressor.modeling import replace_modules_for_calibration
        print("Replacing MoE modules for calibration...")
        replace_modules_for_calibration(model)
    except ImportError:
        print("WARN: replace_modules_for_calibration not available in this "
              "llm-compressor version. Routed-only experts may not see "
              "calibration data, lowering NVFP4 quality on rare experts.")
    except Exception as e:
        print(f"WARN: replace_modules_for_calibration failed: {e}")
        print("      You may need to register a custom MoE wrapper for V4. "
              "Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
              "register it via llmcompressor.modeling.register_module_replacement.")


def _copy_extra_files(src, dst):
    """Copy side files that ``save_pretrained`` does not write.

    Copies the ``encoding``, ``inference`` and ``assets`` directories and any
    regular ``.pdf`` / ``.md`` file from ``src`` into ``dst``. Entries that
    already exist in ``dst`` are left alone.
    """
    import shutil

    for entry in src.iterdir():
        target = dst / entry.name
        if target.exists():
            continue
        if entry.is_dir():
            if entry.name in {"encoding", "inference", "assets"}:
                shutil.copytree(entry, target)
        # is_file() keeps copy2 away from anything that is not a regular file.
        elif entry.is_file() and entry.suffix in {".pdf", ".md"}:
            shutil.copy2(entry, target)


def main(argv=None):
    """Run the llm-compressor oneshot NVFP4 quantization end to end.

    Steps: validate paths, load the tokenizer, build the calibration dataset
    and recipe, load the model on CPU, prepare MoE modules, run ``oneshot``
    with the sequential pipeline, then save the compressed checkpoint and
    copy the extra side files.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Exits via ``sys.exit`` when the source directory has no ``config.json``,
    when ``--dst`` resolves to the same directory as ``--src``, or when no
    usable calibration sample remains.
    """
    args = _parse_args(argv)
    src = Path(args.src).resolve()
    dst = Path(args.dst).resolve()
    if not (src / "config.json").exists():
        sys.exit(f"No config.json at {src}")
    if dst == src:
        sys.exit(f"--dst must differ from --src ({src}); "
                 "refusing to write over the source checkpoint")
    Path(args.offload_folder).mkdir(parents=True, exist_ok=True)

    # Heavy imports happen here so --help is fast
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from datasets import load_dataset
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # ----------------------------------------------------------------------
    # 1. Tokenizer, calibration dataset and recipe
    # ----------------------------------------------------------------------
    # Done before the model load so that a bad dataset name, split or recipe
    # fails before the expensive load below.
    tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)
    ds = _build_calibration_dataset(args, tokenizer, load_dataset)
    recipe = _build_recipe(QuantizationModifier, args.no_activation_quant)

    # ----------------------------------------------------------------------
    # 2. Load model
    # ----------------------------------------------------------------------
    print(f"Loading {src} ...")
    print("  This will take several minutes — FP8 base is ~865 GB.")
    # We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
    # when each block goes to GPU during sequential calibration. The exact
    # behavior depends on transformers' V4 modeling code — if it auto-dequants
    # on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
    model = AutoModelForCausalLM.from_pretrained(
        src,
        torch_dtype="auto",
        device_map="cpu",  # all on CPU; sequential pipeline moves blocks to GPU
        trust_remote_code=True,
        offload_folder=args.offload_folder,
    )
    print(f"  Model class: {type(model).__name__}")
    print(f"  Param count: {sum(p.numel() for p in model.parameters()):,}")

    # ----------------------------------------------------------------------
    # 3. MoE handling — replace_modules_for_calibration
    # ----------------------------------------------------------------------
    _replace_moe_modules(model)

    # ----------------------------------------------------------------------
    # 4. Run oneshot — sequential pipeline is the key for memory
    # ----------------------------------------------------------------------
    num_calibration_samples = len(ds)  # may be below --num-samples after clamping/filtering
    print("Starting oneshot calibration + quantization (this is the long part)...")
    print(f"  num_samples={num_calibration_samples}, max_seq_len={args.max_seq_len}")
    print("  Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        num_calibration_samples=num_calibration_samples,
        # Sequential pipeline: one block at a time on GPU, rest on CPU.
        pipeline="sequential",
        # Calibrate every expert, even routed-only ones that wouldn't see traffic.
        moe_calibrate_all_experts=True,
    )

    # ----------------------------------------------------------------------
    # 5. Save compressed
    # ----------------------------------------------------------------------
    print(f"Saving compressed checkpoint to {dst} ...")
    dst.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(dst), save_compressed=True)
    tokenizer.save_pretrained(str(dst))
    # Copy any extra files that save_pretrained doesn't (encoding/, inference/, PDF)
    _copy_extra_files(src, dst)
    print("Done.")
    print(f"Output: {dst}")
# Run the quantization pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()