# quantize_llmcompressor.py

#!/usr/bin/env python3
"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.

Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with
calibrated activation global scales. Higher quality than the streaming converter
on activation-sensitive ops, at the cost of much longer wall time and more
fragility on a brand-new architecture.

Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):
  - FP8 base resident in CPU RAM:           ~865 GB
  - One transformer block on GPU at a time: ~10-30 GB HBM
  - Activation calibration cache:           tens to a few hundred GB
  - Headroom:                               ~1.5+ TB RAM, ~1.4+ TB HBM

Critical: this loads the model with trust_remote_code=True. V4 architecture is
brand new; expect to need:
  - transformers from source (or recent main)
  - llm-compressor from source
  - The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable

Usage:
    python quantize_llmcompressor.py \\
        --src DeepSeek-V4-Pro-FP8 \\
        --dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\
        --num-samples 256 \\
        --max-seq-len 4096
"""

import argparse
import os
import sys
from pathlib import Path

import torch


def _parse_args():
    """Parse and validate the command-line options for the quantization run.

    Returns:
        argparse.Namespace with ``src``, ``dst``, ``num_samples``,
        ``max_seq_len``, ``calibration_dataset``, ``calibration_split``,
        ``offload_folder`` and ``no_activation_quant``.

    Exits through ``ArgumentParser.error`` (status 2) when ``--num-samples``
    or ``--max-seq-len`` is not a positive integer.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="Source FP8 model directory")
    ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")
    ap.add_argument("--num-samples", type=int, default=256)
    ap.add_argument("--max-seq-len", type=int, default=4096)
    ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")
    ap.add_argument(
        "--calibration-split", default="train_sft",
        help="Split of the calibration dataset to sample from",
    )
    ap.add_argument(
        "--offload-folder", default="/root/nvidia-meeting/.offload",
        help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",
    )
    ap.add_argument(
        "--no-activation-quant", action="store_true",
        help="Quantize weights only (no activation calibration). Faster, closer to Path A."
    )
    args = ap.parse_args()

    # Reject nonsensical sizes here rather than deep inside dataset selection
    # or tokenization, long after the expensive setup has started.
    if args.num_samples <= 0:
        ap.error("--num-samples must be a positive integer")
    if args.max_seq_len <= 0:
        ap.error("--max-seq-len must be a positive integer")
    return args


def _build_calibration_dataset(tokenizer, dataset_name, split, num_samples, max_seq_len):
    """Load, sample, render and tokenize the calibration dataset.

    Args:
        tokenizer: Tokenizer returned by ``AutoTokenizer.from_pretrained``.
        dataset_name: Dataset identifier passed to ``datasets.load_dataset``.
        split: Dataset split to load.
        num_samples: Requested number of samples; clamped to the split size.
        max_seq_len: Truncation length passed to the tokenizer.

    Returns:
        The tokenized dataset; the original columns are removed and replaced
        by whatever fields the tokenizer call returns.
    """
    from datasets import load_dataset

    ds = load_dataset(dataset_name, split=split)

    # select() raises on out-of-range indices, so never ask for more rows than
    # the split actually holds.
    available = len(ds)
    if num_samples > available:
        print(f"WARN: requested {num_samples} calibration samples but "
              f"{dataset_name} [{split}] only has {available}; using all of them.")
    ds = ds.shuffle(seed=42).select(range(min(num_samples, available)))

    state = {"warned_template": False}

    def encode(example):
        # Use the model's chat template if it has one; ultrachat samples have a
        # 'messages' field already in the OpenAI shape.
        add_special_tokens = True
        if "messages" in example:
            try:
                text = tokenizer.apply_chat_template(
                    example["messages"], tokenize=False, add_generation_prompt=False
                )
                # Templated text is expected to already carry the special
                # tokens it needs; adding them again at tokenization time
                # would duplicate them (e.g. a doubled BOS).
                add_special_tokens = False
            except Exception as e:
                # Best-effort fallback, but say so once instead of silently
                # calibrating on differently formatted text.
                if not state["warned_template"]:
                    print(f"WARN: apply_chat_template failed ({e}); falling back "
                          "to newline-joined message contents.")
                    state["warned_template"] = True
                text = "\n".join((m.get("content") or "") for m in example["messages"])
        else:
            text = example.get("text") or example.get("prompt") or ""
        return tokenizer(
            text,
            truncation=True,
            max_length=max_seq_len,
            padding=False,
            return_tensors=None,
            add_special_tokens=add_special_tokens,
        )

    return ds.map(encode, remove_columns=ds.column_names)


def _copy_extra_files(src, dst):
    """Copy auxiliary files from ``src`` to ``dst`` without overwriting.

    Copies the ``encoding``, ``inference`` and ``assets`` directories and any
    top-level ``.pdf`` / ``.md`` regular files. Entries that already exist in
    ``dst`` are left untouched.
    """
    import shutil

    extra_dirs = {"encoding", "inference", "assets"}
    extra_suffixes = {".pdf", ".md"}
    for entry in src.iterdir():
        target = dst / entry.name
        if target.exists():
            continue
        if entry.is_dir():
            if entry.name in extra_dirs:
                shutil.copytree(entry, target)
        elif entry.is_file() and entry.suffix in extra_suffixes:
            shutil.copy2(entry, target)


def main():
    """Run llm-compressor oneshot NVFP4 quantization end to end.

    Steps: parse arguments, validate paths, load the tokenizer, build the
    calibration dataset, load the model on CPU, try to swap in MoE calibration
    wrappers, build the quantization recipe, run ``oneshot`` with the
    sequential pipeline, then save the compressed model, tokenizer and
    auxiliary files to ``--dst``.

    Exits with a message when ``--src`` has no ``config.json`` or when
    ``--dst`` resolves to the same directory as ``--src``.
    """
    args = _parse_args()

    src = Path(args.src).resolve()
    dst = Path(args.dst).resolve()
    if not (src / "config.json").exists():
        sys.exit(f"No config.json at {src}")
    if dst == src:
        sys.exit(f"--dst must differ from --src ({src})")

    # Create the output locations up front so a bad path or permission error
    # surfaces now rather than after the long calibration run.
    dst.mkdir(parents=True, exist_ok=True)
    Path(args.offload_folder).mkdir(parents=True, exist_ok=True)

    # Heavy imports happen here so --help is fast
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # ----------------------------------------------------------------------
    # 1. Tokenizer + calibration dataset
    # ----------------------------------------------------------------------
    # Built before the model load: it only needs the tokenizer, and dataset
    # problems (wrong split, download failure) are much cheaper to hit now.
    tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)

    print(f"Loading calibration dataset {args.calibration_dataset} ...")
    ds = _build_calibration_dataset(
        tokenizer,
        args.calibration_dataset,
        args.calibration_split,
        args.num_samples,
        args.max_seq_len,
    )
    num_samples = len(ds)

    # ----------------------------------------------------------------------
    # 2. Load model
    # ----------------------------------------------------------------------
    print(f"Loading {src} ...")
    print("  This will take several minutes — FP8 base is ~865 GB.")

    # We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16
    # when each block goes to GPU during sequential calibration. The exact
    # behavior depends on transformers' V4 modeling code — if it auto-dequants
    # on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
    model = AutoModelForCausalLM.from_pretrained(
        src,
        torch_dtype="auto",
        device_map="cpu",  # all on CPU; sequential pipeline moves blocks to GPU
        trust_remote_code=True,
        offload_folder=args.offload_folder,
    )
    print(f"  Model class: {type(model).__name__}")
    print(f"  Param count: {sum(p.numel() for p in model.parameters()):,}")

    # ----------------------------------------------------------------------
    # 3. MoE handling — replace_modules_for_calibration
    # ----------------------------------------------------------------------
    # On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes
    # every expert during calibration (otherwise routed-only experts never see
    # data). For DeepSeek V4 the MoE class name is something like
    # `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
    try:
        from llmcompressor.modeling import replace_modules_for_calibration
        print("Replacing MoE modules for calibration...")
        replace_modules_for_calibration(model)
    except ImportError:
        print("WARN: replace_modules_for_calibration not available in this "
              "llm-compressor version. Routed-only experts may not see "
              "calibration data, lowering NVFP4 quality on rare experts.")
    except Exception as e:
        # Deliberately non-fatal: calibration can still proceed without the
        # wrapper, at reduced quality on rarely routed experts.
        print(f"WARN: replace_modules_for_calibration failed: {e}")
        print("      You may need to register a custom MoE wrapper for V4. "
              "Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "
              "register it via llmcompressor.modeling.register_module_replacement.")

    # ----------------------------------------------------------------------
    # 4. Recipe
    # ----------------------------------------------------------------------
    # NVFP4 W4A4 by default. The ignore list mirrors Path A's preserve list:
    # output head, embeddings, MoE router gates (NOT gate_proj!), norms, and
    # V4-specific attention indexer / mHC residual mixing weights.
    # NOTE(review): the router patterns assume `.mlp.gate` module naming —
    # confirm against the actual V4 module names before a full run.
    ignore = [
        "re:.*lm_head",
        "re:.*embed_tokens$",
        "re:.*\\.mlp\\.gate$",
        "re:.*\\.mlp\\.gate\\.weight$",
        "re:.*norm.*",
        "re:.*indexer.*",
        "re:.*hyper_conn.*",
        "re:.*\\.mhc.*",
        "re:.*scoring.*",
    ]

    if args.no_activation_quant:
        print("Recipe: NVFP4 weight-only (W4A16 effective)")
        scheme = "NVFP4A16"  # weight-only variant
    else:
        print("Recipe: NVFP4 W4A4 with activation calibration")
        scheme = "NVFP4"
    recipe = QuantizationModifier(targets="Linear", scheme=scheme, ignore=ignore)

    # ----------------------------------------------------------------------
    # 5. Run oneshot — sequential pipeline is the key for memory
    # ----------------------------------------------------------------------
    print("Starting oneshot calibration + quantization (this is the long part)...")
    print(f"  num_samples={num_samples}, max_seq_len={args.max_seq_len}")
    print("  Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")

    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=args.max_seq_len,
        # Use the clamped count so it never exceeds the rows actually present.
        num_calibration_samples=num_samples,
        # Sequential pipeline: one block at a time on GPU, rest on CPU.
        pipeline="sequential",
        # Calibrate every expert, even routed-only ones that wouldn't see traffic.
        moe_calibrate_all_experts=True,
    )

    # ----------------------------------------------------------------------
    # 6. Save compressed
    # ----------------------------------------------------------------------
    print(f"Saving compressed checkpoint to {dst} ...")
    model.save_pretrained(str(dst), save_compressed=True)
    tokenizer.save_pretrained(str(dst))

    # Copy any extra files that save_pretrained doesn't (encoding/, inference/, PDF)
    _copy_extra_files(src, dst)

    print("Done.")
    print(f"Output: {dst}")


# Script entry point: run the quantization pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
init commit 2026-05-06 23:47:07 +00:00			`#!/usr/bin/env python3`
			`"""Path B: llm-compressor oneshot NVFP4 quantization for DeepSeek V4 Pro.`

			`Uses sequential pipeline + activation calibration to produce W4A4 NVFP4 with`
			`calibrated activation global scales. Higher quality than the streaming converter`
			`on activation-sensitive ops, at the cost of much longer wall time and more`
			`fragility on a brand-new architecture.`

			`Memory plan with 2.7 TB host RAM + 8x B200 (1.5 TB HBM):`
			`- FP8 base resident in CPU RAM: ~865 GB`
			`- One transformer block on GPU at a time: ~10-30 GB HBM`
			`- Activation calibration cache: tens to a few hundred GB`
			`- Headroom: ~1.5+ TB RAM, ~1.4+ TB HBM`

			`Critical: this loads the model with trust_remote_code=True. V4 architecture is`
			`brand new; expect to need:`
			`- transformers from source (or recent main)`
			`- llm-compressor from source`
			`- The V4 modeling code in DeepSeek-V4-Pro-FP8/inference/ to be importable`

			`Usage:`
			`python quantize_llmcompressor.py \\`
			`--src DeepSeek-V4-Pro-FP8 \\`
			`--dst DeepSeek-V4-Pro-NVFP4-llmcompressor \\`
			`--num-samples 256 \\`
			`--max-seq-len 4096`
			`"""`

			`import argparse`
			`import os`
			`import sys`
			`from pathlib import Path`

			`import torch`


			`def main():`
			`ap = argparse.ArgumentParser()`
			`ap.add_argument("--src", required=True, help="Source FP8 model directory")`
			`ap.add_argument("--dst", required=True, help="Output NVFP4 model directory")`
			`ap.add_argument("--num-samples", type=int, default=256)`
			`ap.add_argument("--max-seq-len", type=int, default=4096)`
			`ap.add_argument("--calibration-dataset", default="HuggingFaceH4/ultrachat_200k")`
			`ap.add_argument(`
			`"--offload-folder", default="/root/nvidia-meeting/.offload",`
			`help="NVMe folder for accelerate disk-offload spillover (rarely needed at 2.7TB RAM)",`
			`)`
			`ap.add_argument(`
			`"--no-activation-quant", action="store_true",`
			`help="Quantize weights only (no activation calibration). Faster, closer to Path A."`
			`)`
			`args = ap.parse_args()`

			`src = Path(args.src).resolve()`
			`dst = Path(args.dst).resolve()`
			`if not (src / "config.json").exists():`
			`sys.exit(f"No config.json at {src}")`

			`Path(args.offload_folder).mkdir(parents=True, exist_ok=True)`

			`# Heavy imports happen here so --help is fast`
			`from transformers import AutoModelForCausalLM, AutoTokenizer`
			`from datasets import load_dataset`
			`from llmcompressor import oneshot`
			`from llmcompressor.modifiers.quantization import QuantizationModifier`

			`# ----------------------------------------------------------------------`
			`# 1. Load model`
			`# ----------------------------------------------------------------------`
			`print(f"Loading {src} ...")`
			`print(" This will take several minutes — FP8 base is ~865 GB.")`

			`# We want FP8 weights to stay as FP8 on CPU and only be promoted to BF16`
			`# when each block goes to GPU during sequential calibration. The exact`
			`# behavior depends on transformers' V4 modeling code — if it auto-dequants`
			# on load, expect 3.2 TB BF16 in RAM and you'll spill. Watch `free -h`.
			`tokenizer = AutoTokenizer.from_pretrained(src, trust_remote_code=True)`
			`model = AutoModelForCausalLM.from_pretrained(`
			`src,`
			`torch_dtype="auto",`
			`device_map="cpu", # all on CPU; sequential pipeline moves blocks to GPU`
			`trust_remote_code=True,`
			`offload_folder=args.offload_folder,`
			`)`
			`print(f" Model class: {type(model).__name__}")`
			`print(f" Param count: {sum(p.numel() for p in model.parameters()):,}")`

			`# ----------------------------------------------------------------------`
			`# 2. MoE handling — replace_modules_for_calibration`
			`# ----------------------------------------------------------------------`
			`# On Llama4/Qwen3-MoE, llm-compressor needs a wrapper class that exposes`
			`# every expert during calibration (otherwise routed-only experts never see`
			`# data). For DeepSeek V4 the MoE class name is something like`
			# `DeepseekV4MoE`. Try the canonical entrypoint first; fall back gracefully.
			`try:`
			`from llmcompressor.modeling import replace_modules_for_calibration`
			`print("Replacing MoE modules for calibration...")`
			`replace_modules_for_calibration(model)`
			`except ImportError:`
			`print("WARN: replace_modules_for_calibration not available in this "`
			`"llm-compressor version. Routed-only experts may not see "`
			`"calibration data, lowering NVFP4 quality on rare experts.")`
			`except Exception as e:`
			`print(f"WARN: replace_modules_for_calibration failed: {e}")`
			`print(" You may need to register a custom MoE wrapper for V4. "`
			`"Find the MoE class name in DeepSeek-V4-Pro-FP8/inference/ and "`
			`"register it via llmcompressor.modeling.register_module_replacement.")`

			`# ----------------------------------------------------------------------`
			`# 3. Calibration dataset`
			`# ----------------------------------------------------------------------`
			`print(f"Loading calibration dataset {args.calibration_dataset} ...")`
			`ds = load_dataset(args.calibration_dataset, split="train_sft")`
			`ds = ds.shuffle(seed=42).select(range(args.num_samples))`

			`def preprocess(example):`
			`# Use the model's chat template if it has one; ultrachat samples have a`
			`# 'messages' field already in the OpenAI shape.`
			`if "messages" in example:`
			`try:`
			`text = tokenizer.apply_chat_template(`
			`example["messages"], tokenize=False, add_generation_prompt=False`
			`)`
			`except Exception:`
			`text = "\n".join(m.get("content", "") for m in example["messages"])`
			`else:`
			`text = example.get("text") or example.get("prompt") or ""`
			`return {"text": text}`

			`ds = ds.map(preprocess, remove_columns=ds.column_names)`

			`def tokenize(example):`
			`return tokenizer(`
			`example["text"],`
			`truncation=True,`
			`max_length=args.max_seq_len,`
			`padding=False,`
			`return_tensors=None,`
			`)`

			`ds = ds.map(tokenize, remove_columns=["text"])`

			`# ----------------------------------------------------------------------`
			`# 4. Recipe`
			`# ----------------------------------------------------------------------`
			`# NVFP4 W4A4 by default. The ignore list mirrors Path A's preserve list:`
			`# output head, embeddings, MoE router gates (NOT gate_proj!), norms, and`
			`# V4-specific attention indexer / mHC residual mixing weights.`
			`ignore = [`
			`"re:.*lm_head",`
			`"re:.*embed_tokens$",`
			`"re:.*\\.mlp\\.gate$",`
			`"re:.*\\.mlp\\.gate\\.weight$",`
			`"re:.*norm.*",`
			`"re:.*indexer.*",`
			`"re:.*hyper_conn.*",`
			`"re:.*\\.mhc.*",`
			`"re:.*scoring.*",`
			`]`

			`if args.no_activation_quant:`
			`print("Recipe: NVFP4 weight-only (W4A16 effective)")`
			`recipe = QuantizationModifier(`
			`targets="Linear",`
			`scheme="NVFP4A16", # weight-only variant`
			`ignore=ignore,`
			`)`
			`else:`
			`print("Recipe: NVFP4 W4A4 with activation calibration")`
			`recipe = QuantizationModifier(`
			`targets="Linear",`
			`scheme="NVFP4",`
			`ignore=ignore,`
			`)`

			`# ----------------------------------------------------------------------`
			`# 5. Run oneshot — sequential pipeline is the key for memory`
			`# ----------------------------------------------------------------------`
			`print("Starting oneshot calibration + quantization (this is the long part)...")`
			`print(f" num_samples={args.num_samples}, max_seq_len={args.max_seq_len}")`
			`print(f" Watch with: watch -n 5 'free -h && nvidia-smi --query-gpu=memory.used,memory.free --format=csv'")`

			`oneshot(`
			`model=model,`
			`dataset=ds,`
			`recipe=recipe,`
			`max_seq_length=args.max_seq_len,`
			`num_calibration_samples=args.num_samples,`
			`# Sequential pipeline: one block at a time on GPU, rest on CPU.`
			`pipeline="sequential",`
			`# Calibrate every expert, even routed-only ones that wouldn't see traffic.`
			`moe_calibrate_all_experts=True,`
			`)`

			`# ----------------------------------------------------------------------`
			`# 6. Save compressed`
			`# ----------------------------------------------------------------------`
			`print(f"Saving compressed checkpoint to {dst} ...")`
			`dst.mkdir(parents=True, exist_ok=True)`
			`model.save_pretrained(str(dst), save_compressed=True)`
			`tokenizer.save_pretrained(str(dst))`

			`# Copy any extra files that save_pretrained doesn't (encoding/, inference/, PDF)`
			`import shutil`
			`for fname in src.iterdir():`
			`if fname.is_dir() and fname.name in {"encoding", "inference", "assets"}:`
			`dst_sub = dst / fname.name`
			`if not dst_sub.exists():`
			`shutil.copytree(fname, dst_sub)`
			`elif fname.suffix in {".pdf", ".md"} and not (dst / fname.name).exists():`
			`shutil.copy2(fname, dst / fname.name)`

			`print("Done.")`
			`print(f"Output: {dst}")`


			`if __name__ == "__main__":`
			`main()`