# deepseek-v4-quant/quantize_llmcompressor.py

#!/usr/bin/env python3
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.

Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with
calibrated activation global scales. Higher quality than the streaming converter
on activation-sensitive ops, at the cost of much longer wall time and more
fragility on a brand-new architecture.

Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
  - FP8 base resident in CPU RAM:           ~865 GB
  - One transformer block on GPU at a time: ~10-30 GB HBM
  - Activation calibration cache:           tens to a few hundred GB
  - Headroom:                               ~1.5+ TB RAM, ~1.4+ TB HBM

Critical: this loads the model with trust_remote_code=True. V4 architecture is
brand new; expect to need:
  - transformers from source (or recent main)
  - llm-compressor from source
  - The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable

Usage:
    python quantize_llmcompressor.py \\
        --src DeepSeek-V4-Pro-FP8 \\
        --dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
        --num-samples 256 \\
        --max-seq-len 4096
"""

import argparse
import os
import sys
from pathlib import Path

import torch


def _positive_int(value):
    """argparse ``type=`` callback that accepts only integers greater than zero.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not an integer or is <= 0.
            argparse turns this into a usage error (exit status 2).
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer, got {value!r}"
        ) from None
    if number <= 0:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def _parse_args(argv=None):
    """Build the CLI parser and parse ``argv`` (``sys.argv[1:]`` when None)."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="Source FP8 model directory")
    ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
    ap.add_argument("--num-samples", type=_positive_int, default=256)
    ap.add_argument("--max-seq-len", type=_positive_int, default=4096)
    ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
    ap.add_argument(
        "--calibration-split", default="train_sft",
        help="Dataset split to draw calibration samples from "
             "(default matches ultrachat_200k's SFT split)",
    )
    ap.add_argument(
        "--offload-folder", default="/root/nvidia-meeting/.offload",
        help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
    )
    ap.add_argument(
        "--no-activation-quant", action="store_true",
        help="Quantize weights only (no activation calibration). Faster, closer to Path A."
    )
    return ap.parse_args(argv)


def _build_calibration_dataset(tokenizer, args):
    """Load, sample, render and tokenize the calibration set.

    Args:
        tokenizer: tokenizer used both for chat-template rendering and for
            tokenization.
        args: parsed CLI namespace; reads ``calibration_dataset``,
            ``calibration_split``, ``num_samples`` and ``max_seq_len``.

    Returns:
        The tokenized dataset. It may hold fewer than ``args.num_samples`` rows
        when the split is small or when rows render to empty text.

    Exits the process (via ``sys.exit``) if no usable sample remains.
    """
    from datasets import load_dataset

    print(f"Loading calibration dataset {args.calibration_dataset} ...")
    ds = load_dataset(args.calibration_dataset, split=args.calibration_split)

    # Clamp to what the split actually holds; select() raises on out-of-range indices.
    wanted = min(args.num_samples, len(ds))
    if wanted < args.num_samples:
        print(f"WARN: split '{args.calibration_split}' has only {len(ds)} rows; "
              f"using {wanted} calibration samples instead of {args.num_samples}.")
    ds = ds.shuffle(seed=42).select(range(wanted))

    template_warned = False

    def preprocess(example):
        # Use the model's chat template if it has one; ultrachat samples have a
        # 'messages' field already in the OpenAI shape.
        nonlocal template_warned
        if "messages" in example:
            try:
                text = tokenizer.apply_chat_template(
                    example["messages"], tokenize=False, add_generation_prompt=False
                )
            except Exception as exc:
                # Best-effort fallback, but say so once instead of silently
                # changing what the model is calibrated on.
                if not template_warned:
                    print(f"WARN: apply_chat_template failed ({exc!r}); "
                          "falling back to newline-joined message contents.")
                    template_warned = True
                # `or ""` also covers messages whose content is explicitly None.
                text = "\n".join((m.get("content") or "") for m in example["messages"])
        else:
            text = example.get("text") or example.get("prompt") or ""
        return {"text": text}

    ds = ds.map(preprocess, remove_columns=ds.column_names)

    # Rows without any of messages/text/prompt render to "". Drop them rather
    # than calibrating activation scales on empty prompts.
    rendered = len(ds)
    ds = ds.filter(lambda row: bool(row["text"].strip()))
    if len(ds) == 0:
        sys.exit(
            f"No usable calibration text in {args.calibration_dataset} "
            f"(split '{args.calibration_split}'): expected a 'messages', 'text' "
            "or 'prompt' column."
        )
    if len(ds) < rendered:
        print(f"WARN: dropped {rendered - len(ds)} empty calibration samples.")

    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=args.max_seq_len,
            padding=False,
            return_tensors=None,
        )

    return ds.map(tokenize, remove_columns=["text"])


def _build_recipe(weight_only):
    """Return the QuantizationModifier for NVFP4 (W4A4) or NVFP4A16 (weight-only)."""
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # The ignore list mirrors Path A's preserve list: output head, embeddings,
    # MoE router gates (NOT gate_proj!), norms, and V4-specific attention
    # indexer / mHC residual mixing weights.
    ignore = [
        "re:.*lm_head",
        "re:.*embed_tokens$",
        "re:.*\\.mlp\\.gate$",
        "re:.*\\.mlp\\.gate\\.weight$",
        "re:.*norm.*",
        "re:.*indexer.*",
        "re:.*hyper_conn.*",
        "re:.*\\.mhc.*",
        "re:.*scoring.*",
    ]

    if weight_only:
        print("Recipe: NVFP4 weight-only (W4A16 effective)")
        scheme = "NVFP4A16"  # weight-only variant
    else:
        print("Recipe: NVFP4 W4A4 with activation calibration")
        scheme = "NVFP4"
    return QuantizationModifier(targets="Linear", scheme=scheme, ignore=ignore)


def _copy_extra_files(src, dst):
    """Copy files from ``src`` that save_pretrained does not write into ``dst``.

    Copies the ``encoding``, ``inference`` and ``assets`` directories and any
    top-level ``.pdf`` / ``.md`` files. Anything already present in ``dst`` is
    left untouched.
    """
    import shutil

    for entry in src.iterdir():
        target = dst / entry.name
        if target.exists():
            continue
        if entry.is_dir():
            if entry.name in {"encoding", "inference", "assets"}:
                shutil.copytree(entry, target)
        elif entry.is_file() and entry.suffix in {".pdf", ".md"}:
            # is_file() guard: copy2 must never be handed a directory that
            # merely has a .md/.pdf-looking name.
            shutil.copy2(entry, target)


def main():
    """Quantize a DeepSeek V4 FP8 checkpoint to NVFP4 with llm-compressor oneshot.

    Reads its options from the command line. Validates the paths, builds the
    calibration dataset and recipe, loads the model on CPU, runs the sequential
    oneshot pipeline, then saves the compressed checkpoint plus auxiliary files
    to ``--dst``.

    Cheap steps that can fail (argument validation, output directories,
    dataset download, recipe construction) run before the model load so
    mistakes surface in seconds rather than after the multi-minute load or the
    calibration run.
    """
    args = _parse_args()

    src = Path(args.src).resolve()
    dst = Path(args.dst).resolve()
    if not (src / "config.json").exists():
        sys.exit(f"No config.json at {src}")
    if dst == src:
        sys.exit(f"--dst must differ from --src ({src}); "
                 "refusing to write over the source checkpoint")

    # Create the directories up front: an unwritable --dst should fail now,
    # not when saving after calibration has finished.
    dst.mkdir(parents=True, exist_ok=True)
    Path(args.offload_folder).mkdir(parents=True, exist_ok=True)

    # Heavy imports happen here so --help is fast
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llmcompressor import oneshot

    # ----------------------------------------------------------------------
    # 1. Tokenizer, calibration dataset, recipe (cheap; done before the load)
    # ----------------------------------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)
    ds = _build_calibration_dataset(tokenizer, args)
    recipe = _build_recipe(weight_only=args.no_activation_quant)

    # ----------------------------------------------------------------------
    # 2. Load model
    # ----------------------------------------------------------------------
    print(f"Loading {src} ...")
    print("  This will take several minutes — FP8 base is ~865 GB.")

    # We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
    # when each block goes to GPU during sequential calibration. The exact
    # behavior depends on transformers' V4 modeling code — if it auto-dequants
    # on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
    model = AutoModelForCausalLM.from_pretrained(
        src,
        torch_dtype="auto",
        device_map="cpu",  # all on CPU; sequential pipeline moves blocks to GPU
        trust_remote_code=True,
        offload_folder=args.offload_folder,
    )
    print(f"  Model class: {type(model).__name__}")
    print(f"  Param count: {sum(p.numel() for p in model.parameters()):,}")

    # ----------------------------------------------------------------------
    # 3. MoE handling — replace_modules_for_calibration
    # ----------------------------------------------------------------------
    # On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
    # every expert during calibration (otherwise routed-only experts never see
    # data). For DeepSeek V4 the MoE class name is something like
    # `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
    try:
        from llmcompressor.modeling import replace_modules_for_calibration
        print("Replacing MoE modules for calibration...")
        replace_modules_for_calibration(model)
    except ImportError:
        print("WARN: replace_modules_for_calibration not available in this "
              "llm-compressor version. Routed-only experts may not see "
              "calibration data, lowering NVFP4 quality on rare experts.")
    except Exception as e:
        # Deliberately non-fatal: calibration can still proceed without the wrapper.
        print(f"WARN: replace_modules_for_calibration failed: {e}")
        print("      You may need to register a custom MoE wrapper for V4. "
              "Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
              "register it via llmcompressor.modeling.register_module_replacement.")

    # ----------------------------------------------------------------------
    # 4. Run oneshot — sequential pipeline is the key for memory
    # ----------------------------------------------------------------------
    num_samples = len(ds)  # may be below --num-samples after clamping/filtering
    print("Starting oneshot calibration + quantization (this is the long part)...")
    print(f"  num_samples={num_samples}, max_seq_len={args.max_seq_len}")
    print("  Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")

    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        num_calibration_samples=num_samples,
        # Sequential pipeline: one block at a time on GPU, rest on CPU.
        pipeline="sequential",
        # Calibrate every expert, even routed-only ones that wouldn't see traffic.
        moe_calibrate_all_experts=True,
    )

    # ----------------------------------------------------------------------
    # 5. Save compressed
    # ----------------------------------------------------------------------
    print(f"Saving compressed checkpoint to {dst} ...")
    model.save_pretrained(str(dst), save_compressed=True)
    tokenizer.save_pretrained(str(dst))

    # Copy any extra files that save_pretrained doesn't
    # (encoding/, inference/, assets/, top-level PDF and Markdown files).
    _copy_extra_files(src, dst)

    print("Done.")
    print(f"Output: {dst}")


# Script entry point: run the quantization pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()