#!/usr/bin/env python3
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.

Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with
calibrated activation global scales. Higher quality than the streaming converter
on activation-sensitive ops, at the cost of much longer wall time and more
fragility on a brand-new architecture.

Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
  - FP8 base resident in CPU RAM: ~865 GB
  - One transformer block on GPU at a time: ~10-30 GB HBM
  - Activation calibration cache: tens to a few hundred GB
  - Headroom: ~1.5+ TB RAM, ~1.4+ TB HBM

Critical: this loads the model with trust_remote_code=True. V4 architecture is
brand new; expect to need:
  - transformers from source (or recent main)
  - llm-compressor from source
  - The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable

Usage:
  python quantize_llmcompressor.py \\
      --src DeepSeek-V4-Pro-FP8 \\
      --dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
      --num-samples 256 \\
      --max-seq-len 4096
"""
import argparse
import os
import shutil
import sys
from pathlib import Path

import torch

# Source-checkpoint entries that `save_pretrained` does not write but that the
# output checkpoint should still carry (encoding/, inference/, PDF, markdown).
_EXTRA_DIRS = frozenset({"encoding", "inference", "assets"})
_EXTRA_SUFFIXES = frozenset({".pdf", ".md"})


def _positive_int(value: str) -> int:
    """argparse ``type=`` callback that accepts only integers >= 1.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not an integer or is < 1.
            argparse turns this into a usage error (exit status 2).
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected a positive integer, got {value!r}"
        ) from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"expected a positive integer, got {value!r}"
        )
    return number


def _parse_args(argv=None) -> argparse.Namespace:
    """Build the CLI parser and parse ``argv`` (``sys.argv[1:]`` when None)."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="Source FP8 model directory")
    ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
    # Validated at parse time so a bad value fails before the model is loaded.
    ap.add_argument("--num-samples", type=_positive_int, default=256)
    ap.add_argument("--max-seq-len", type=_positive_int, default=4096)
    ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
    ap.add_argument(
        "--calibration-split",
        default="train_sft",
        help="Dataset split to draw calibration samples from "
             "(default matches ultrachat_200k)",
    )
    ap.add_argument(
        "--offload-folder",
        default="/root/nvidia-meeting/.offload",
        help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
    )
    ap.add_argument(
        "--no-activation-quant",
        action="store_true",
        help="Quantize weights only (no activation calibration). Faster, closer to Path A."
    )
    return ap.parse_args(argv)


def _copy_extra_files(src: Path, dst: Path) -> None:
    """Copy auxiliary files from ``src`` to ``dst`` without overwriting.

    Copies the directories named in ``_EXTRA_DIRS`` and regular files whose
    suffix is in ``_EXTRA_SUFFIXES``. Anything already present in ``dst`` is
    left untouched.
    """
    for entry in src.iterdir():
        target = dst / entry.name
        if target.exists():
            continue
        if entry.is_dir():
            if entry.name in _EXTRA_DIRS:
                shutil.copytree(entry, target)
        elif entry.is_file() and entry.suffix in _EXTRA_SUFFIXES:
            # The is_file() guard keeps copy2 away from anything that is not a
            # regular file but happens to carry a .md/.pdf suffix.
            shutil.copy2(entry, target)


def main():
    """Quantize the FP8 checkpoint at ``--src`` to NVFP4 and save it to ``--dst``.

    Steps: parse and validate the CLI, load the model on CPU, swap in MoE
    calibration wrappers when llm-compressor provides them, build the
    calibration set, run ``oneshot`` with the sequential pipeline, then save
    the compressed checkpoint plus auxiliary files from the source directory.

    Exits via ``sys.exit`` when ``--src`` has no ``config.json`` or when no
    usable calibration text remains; argparse exits with status 2 on invalid
    arguments.
    """
    args = _parse_args()

    src = Path(args.src).resolve()
    dst = Path(args.dst).resolve()
    if not (src / "config.json").exists():
        sys.exit(f"No config.json at {src}")
    Path(args.offload_folder).mkdir(parents=True, exist_ok=True)
    # Create the output directory up front: an unwritable --dst should fail
    # now, not after the hours-long calibration run.
    dst.mkdir(parents=True, exist_ok=True)

    # Heavy imports happen here so --help is fast
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from datasets import load_dataset
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # ----------------------------------------------------------------------
    # 1. Load model
    # ----------------------------------------------------------------------
    print(f"Loading {src} ...")
    print(" This will take several minutes — FP8 base is ~865 GB.")

    # We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
    # when each block goes to GPU during sequential calibration. The exact
    # behavior depends on transformers' V4 modeling code — if it auto-dequants
    # on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
    tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        src,
        torch_dtype="auto",
        device_map="cpu",  # all on CPU; sequential pipeline moves blocks to GPU
        trust_remote_code=True,
        offload_folder=args.offload_folder,
    )
    print(f" Model class: {type(model).__name__}")
    print(f" Param count: {sum(p.numel() for p in model.parameters()):,}")

    # ----------------------------------------------------------------------
    # 2. MoE handling — replace_modules_for_calibration
    # ----------------------------------------------------------------------
    # On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
    # every expert during calibration (otherwise routed-only experts never see
    # data). For DeepSeek V4 the MoE class name is something like
    # `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
    try:
        from llmcompressor.modeling import replace_modules_for_calibration
        print("Replacing MoE modules for calibration...")
        replace_modules_for_calibration(model)
    except ImportError:
        print("WARN: replace_modules_for_calibration not available in this "
              "llm-compressor version. Routed-only experts may not see "
              "calibration data, lowering NVFP4 quality on rare experts.")
    except Exception as e:
        # Deliberately best-effort: calibration can still proceed without the
        # wrapper, so report the failure and continue.
        print(f"WARN: replace_modules_for_calibration failed: {e}")
        print(" You may need to register a custom MoE wrapper for V4. "
              "Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
              "register it via llmcompressor.modeling.register_module_replacement.")

    # ----------------------------------------------------------------------
    # 3. Calibration dataset
    # ----------------------------------------------------------------------
    print(f"Loading calibration dataset {args.calibration_dataset} ...")
    ds = load_dataset(args.calibration_dataset, split=args.calibration_split)
    # Clamp to what the split actually holds; selecting indices past the end
    # would abort the run after the model has already been loaded.
    num_samples = min(args.num_samples, len(ds))
    if num_samples < args.num_samples:
        print(f"WARN: requested {args.num_samples} calibration samples but "
              f"split '{args.calibration_split}' only has {len(ds)}; "
              f"using {num_samples}.")
    ds = ds.shuffle(seed=42).select(range(num_samples))

    chat_template_warned = False

    def preprocess(example):
        """Render one raw sample to a single ``text`` string."""
        nonlocal chat_template_warned
        # Use the model's chat template if it has one; ultrachat samples have a
        # 'messages' field already in the OpenAI shape.
        if "messages" in example:
            try:
                text = tokenizer.apply_chat_template(
                    example["messages"], tokenize=False, add_generation_prompt=False
                )
            except Exception as e:
                # Fall back to plain concatenation, but say so once: calibrating
                # on un-templated text is a silent quality change otherwise.
                if not chat_template_warned:
                    print(f"WARN: apply_chat_template failed ({e}); falling back "
                          "to newline-joined message contents.")
                    chat_template_warned = True
                # `content` may be present but None; treat that as empty.
                text = "\n".join(m.get("content") or "" for m in example["messages"])
        else:
            text = example.get("text") or example.get("prompt") or ""
        return {"text": text}

    ds = ds.map(preprocess, remove_columns=ds.column_names)

    # Drop samples that produced no text (e.g. a dataset without any of the
    # 'messages' / 'text' / 'prompt' fields) instead of calibrating on them.
    rendered = len(ds)
    ds = ds.filter(lambda example: bool(example["text"].strip()))
    if len(ds) == 0:
        sys.exit(
            f"No usable calibration text in {args.calibration_dataset} "
            f"(split '{args.calibration_split}'): expected a 'messages', "
            "'text' or 'prompt' field."
        )
    if len(ds) < rendered:
        print(f"WARN: dropped {rendered - len(ds)} empty calibration samples; "
              f"{len(ds)} remain.")
    num_samples = len(ds)

    def tokenize(example):
        """Tokenize one rendered sample, truncated to ``--max-seq-len``."""
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=args.max_seq_len,
            padding=False,
            return_tensors=None,
        )

    ds = ds.map(tokenize, remove_columns=["text"])

    # ----------------------------------------------------------------------
    # 4. Recipe
    # ----------------------------------------------------------------------
    # NVFP4 W4A4 by default. The ignore list mirrors Path A's preserve list:
    # output head, embeddings, MoE router gates (NOT gate_proj!), norms, and
    # V4-specific attention indexer / mHC residual mixing weights.
    ignore = [
        "re:.*lm_head",
        "re:.*embed_tokens$",
        "re:.*\\.mlp\\.gate$",
        "re:.*\\.mlp\\.gate\\.weight$",
        "re:.*norm.*",
        "re:.*indexer.*",
        "re:.*hyper_conn.*",
        "re:.*\\.mhc.*",
        "re:.*scoring.*",
    ]

    if args.no_activation_quant:
        print("Recipe: NVFP4 weight-only (W4A16 effective)")
        scheme = "NVFP4A16"  # weight-only variant
    else:
        print("Recipe: NVFP4 W4A4 with activation calibration")
        scheme = "NVFP4"
    recipe = QuantizationModifier(targets="Linear", scheme=scheme, ignore=ignore)

    # ----------------------------------------------------------------------
    # 5. Run oneshot — sequential pipeline is the key for memory
    # ----------------------------------------------------------------------
    print("Starting oneshot calibration + quantization (this is the long part)...")
    print(f" num_samples={num_samples}, max_seq_len={args.max_seq_len}")
    print(" Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")

    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        # Pass the number of samples actually prepared, not the requested one.
        num_calibration_samples=num_samples,
        # Sequential pipeline: one block at a time on GPU, rest on CPU.
        pipeline="sequential",
        # Calibrate every expert, even routed-only ones that wouldn't see traffic.
        moe_calibrate_all_experts=True,
    )

    # ----------------------------------------------------------------------
    # 6. Save compressed
    # ----------------------------------------------------------------------
    print(f"Saving compressed checkpoint to {dst} ...")
    dst.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(dst), save_compressed=True)
    tokenizer.save_pretrained(str(dst))

    # Copy any extra files that save_pretrained doesn't (encoding/, inference/, PDF)
    _copy_extra_files(src, dst)

    print("Done.")
    print(f"Output: {dst}")


if __name__ == "__main__":
    main()