218 lines
8.9 KiB
Python
218 lines
8.9 KiB
Python
#!/usr/bin/env python3
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.

Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with
calibrated activation global scales. Higher quality than the streaming converter
on activation-sensitive ops, at the cost of much longer wall time and more
fragility on a brand-new architecture.

Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
  - FP8 base resident in CPU RAM: ~865 GB
  - One transformer block on GPU at a time: ~10-30 GB HBM
  - Activation calibration cache: tens to a few hundred GB
  - Headroom: ~1.5+ TB RAM, ~1.4+ TB HBM

Critical: this loads the model with trust_remote_code=True. V4 architecture is
brand new; expect to need:
  - transformers from source (or recent main)
  - llm-compressor from source
  - The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable

Usage:
    python quantize_llmcompressor.py \\
        --src DeepSeek-V4-Pro-FP8 \\
        --dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
        --num-samples 256 \\
        --max-seq-len 4096
"""
|
|
import argparse
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
|
|
|
|
# Top-level entries of the source checkpoint that save_pretrained() does not
# write but that should accompany the quantized output.
_EXTRA_DIRS = frozenset({"encoding", "inference", "assets"})
_EXTRA_FILE_SUFFIXES = frozenset({".pdf", ".md"})


def _positive_int(value):
    """argparse ``type=`` callable: parse *value* as an int and reject values < 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _parse_args(argv=None):
    """Build the CLI parser and parse *argv* (``sys.argv[1:]`` when None)."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="Source FP8 model directory")
    ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
    ap.add_argument("--num-samples", type=_positive_int, default=256)
    ap.add_argument("--max-seq-len", type=_positive_int, default=4096)
    ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
    ap.add_argument(
        "--calibration-split", default="train_sft",
        help="Dataset split to draw calibration samples from "
             "(the default matches ultrachat_200k)",
    )
    ap.add_argument(
        "--offload-folder", default="/root/nvidia-meeting/.offload",
        help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
    )
    ap.add_argument(
        "--no-activation-quant", action="store_true",
        help="Quantize weights only (no activation calibration). Faster, closer to Path A."
    )
    return ap.parse_args(argv)


def _prepare_moe_for_calibration(model):
    """Best-effort swap of MoE modules for calibration-friendly wrappers.

    On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
    every expert during calibration (otherwise routed-only experts never see
    data). For DeepSeek V4 the MoE class name is something like
    `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
    Failures are reported as warnings and never abort the run.
    """
    # Only the import is guarded by ImportError, so an ImportError raised from
    # inside the replacement call is not misreported as "not available".
    try:
        from llmcompressor.modeling import replace_modules_for_calibration
    except ImportError:
        print("WARN: replace_modules_for_calibration not available in this "
              "llm-compressor version. Routed-only experts may not see "
              "calibration data, lowering NVFP4 quality on rare experts.")
        return

    print("Replacing MoE modules for calibration...")
    try:
        replace_modules_for_calibration(model)
    except Exception as e:  # deliberate best-effort: warn and continue
        print(f"WARN: replace_modules_for_calibration failed: {e}")
        print(" You may need to register a custom MoE wrapper for V4. "
              "Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
              "register it via llmcompressor.modeling.register_module_replacement.")


def _build_calibration_dataset(args, tokenizer, load_dataset):
    """Load, subsample, render and tokenize the calibration dataset.

    Args:
        args: Parsed CLI namespace (uses calibration_dataset, calibration_split,
            num_samples and max_seq_len).
        tokenizer: Tokenizer used for chat-template rendering and tokenization.
        load_dataset: The ``datasets.load_dataset`` callable (imported lazily
            by the caller so ``--help`` stays fast).

    Returns:
        ``(dataset, num_samples)`` where ``num_samples`` is the number of rows
        actually selected, which may be lower than ``args.num_samples``.
    """
    print(f"Loading calibration dataset {args.calibration_dataset} ...")
    ds = load_dataset(args.calibration_dataset, split=args.calibration_split)

    available = len(ds)
    if available == 0:
        sys.exit(
            f"Calibration split {args.calibration_split!r} of "
            f"{args.calibration_dataset} is empty"
        )
    # Clamp instead of letting select() raise on out-of-range indices.
    num_samples = min(args.num_samples, available)
    if num_samples < args.num_samples:
        print(f"WARN: requested {args.num_samples} calibration samples but the "
              f"split only has {available}; using {num_samples}.")
    ds = ds.shuffle(seed=42).select(range(num_samples))

    def preprocess(example):
        # Use the model's chat template if it has one; ultrachat samples have a
        # 'messages' field already in the OpenAI shape.
        if "messages" in example:
            try:
                text = tokenizer.apply_chat_template(
                    example["messages"], tokenize=False, add_generation_prompt=False
                )
            except Exception:
                # Deliberate fallback when the template cannot be applied:
                # join raw contents. `or ""` guards against content=None.
                text = "\n".join(
                    (m.get("content") or "") for m in example["messages"]
                )
        else:
            text = example.get("text") or example.get("prompt") or ""
        return {"text": text}

    ds = ds.map(preprocess, remove_columns=ds.column_names)

    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=args.max_seq_len,
            padding=False,
            return_tensors=None,
        )

    ds = ds.map(tokenize, remove_columns=["text"])
    return ds, num_samples


def _build_ignore_list():
    """Return the module-name patterns excluded from quantization.

    The ignore list mirrors Path A's preserve list: output head, embeddings,
    MoE router gates (NOT gate_proj!), norms, and V4-specific attention
    indexer / mHC residual mixing weights.
    """
    return [
        "re:.*lm_head",
        "re:.*embed_tokens$",
        "re:.*\\.mlp\\.gate$",
        "re:.*\\.mlp\\.gate\\.weight$",
        "re:.*norm.*",
        "re:.*indexer.*",
        "re:.*hyper_conn.*",
        "re:.*\\.mhc.*",
        "re:.*scoring.*",
    ]


def _copy_extra_files(src, dst):
    """Copy extras that save_pretrained doesn't (encoding/, inference/, PDF).

    Existing targets in *dst* are never overwritten.
    """
    import shutil

    for entry in src.iterdir():
        target = dst / entry.name
        if target.exists():
            continue
        if entry.is_dir():
            if entry.name in _EXTRA_DIRS:
                shutil.copytree(entry, target)
        elif entry.is_file() and entry.suffix in _EXTRA_FILE_SUFFIXES:
            # is_file() keeps copy2 away from anything that is not a regular
            # file (or a symlink to one).
            shutil.copy2(entry, target)


def main(argv=None):
    """Quantize an FP8 checkpoint to NVFP4 with llm-compressor's oneshot flow.

    Steps: validate paths, load tokenizer and model on CPU, prepare MoE modules
    for calibration, build the calibration dataset, build the quantization
    recipe, run ``oneshot`` with the sequential pipeline, save the compressed
    checkpoint and copy auxiliary files from the source directory.

    Args:
        argv: Optional list of CLI arguments; defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: If arguments are invalid, ``--src`` has no ``config.json``,
            ``--dst`` resolves to the same directory as ``--src``, or the
            calibration split is empty.
    """
    args = _parse_args(argv)

    src = Path(args.src).resolve()
    dst = Path(args.dst).resolve()
    if not (src / "config.json").exists():
        sys.exit(f"No config.json at {src}")
    if src == dst:
        # Saving into the source directory would clobber the FP8 checkpoint.
        sys.exit(f"--dst must differ from --src ({src}); refusing to overwrite the source checkpoint")

    Path(args.offload_folder).mkdir(parents=True, exist_ok=True)

    # Heavy imports happen here so --help is fast
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from datasets import load_dataset
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # ----------------------------------------------------------------------
    # 1. Load model
    # ----------------------------------------------------------------------
    print(f"Loading {src} ...")
    print(" This will take several minutes — FP8 base is ~865 GB.")

    # We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
    # when each block goes to GPU during sequential calibration. The exact
    # behavior depends on transformers' V4 modeling code — if it auto-dequants
    # on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
    tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        src,
        torch_dtype="auto",
        device_map="cpu",  # all on CPU; sequential pipeline moves blocks to GPU
        trust_remote_code=True,
        offload_folder=args.offload_folder,
    )
    print(f" Model class: {type(model).__name__}")
    print(f" Param count: {sum(p.numel() for p in model.parameters()):,}")

    # ----------------------------------------------------------------------
    # 2. MoE handling — replace_modules_for_calibration
    # ----------------------------------------------------------------------
    _prepare_moe_for_calibration(model)

    # ----------------------------------------------------------------------
    # 3. Calibration dataset
    # ----------------------------------------------------------------------
    ds, num_samples = _build_calibration_dataset(args, tokenizer, load_dataset)

    # ----------------------------------------------------------------------
    # 4. Recipe
    # ----------------------------------------------------------------------
    # NVFP4 W4A4 by default; --no-activation-quant selects the weight-only variant.
    if args.no_activation_quant:
        print("Recipe: NVFP4 weight-only (W4A16 effective)")
        scheme = "NVFP4A16"  # weight-only variant
    else:
        print("Recipe: NVFP4 W4A4 with activation calibration")
        scheme = "NVFP4"
    recipe = QuantizationModifier(
        targets="Linear",
        scheme=scheme,
        ignore=_build_ignore_list(),
    )

    # ----------------------------------------------------------------------
    # 5. Run oneshot — sequential pipeline is the key for memory
    # ----------------------------------------------------------------------
    print("Starting oneshot calibration + quantization (this is the long part)...")
    print(f" num_samples={num_samples}, max_seq_len={args.max_seq_len}")
    print(" Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")

    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        # Use the clamped count so it never exceeds the rows actually selected.
        num_calibration_samples=num_samples,
        # Sequential pipeline: one block at a time on GPU, rest on CPU.
        pipeline="sequential",
        # Calibrate every expert, even routed-only ones that wouldn't see traffic.
        moe_calibrate_all_experts=True,
    )

    # ----------------------------------------------------------------------
    # 6. Save compressed
    # ----------------------------------------------------------------------
    print(f"Saving compressed checkpoint to {dst} ...")
    dst.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(dst), save_compressed=True)
    tokenizer.save_pretrained(str(dst))

    _copy_extra_files(src, dst)

    print("Done.")
    print(f"Output: {dst}")
|
|
|
|
# Script entry point: run the quantization pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()