From f1d21900ea4befac32c03e4ef8a1aa04c4930995 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Fri, 8 May 2026 17:13:39 +0000
Subject: [PATCH] =?UTF-8?q?Remove=20upcast=5Fto=5Fbf16.py=20=E2=80=94=20su?=
 =?UTF-8?q?perseded=20by=20dequant=5Ffp8=5Fto=5Fbf16.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/upcast_to_bf16.py | 84 ---------------------------------------
 1 file changed, 84 deletions(-)
 delete mode 100644 scripts/upcast_to_bf16.py

diff --git a/scripts/upcast_to_bf16.py b/scripts/upcast_to_bf16.py
deleted file mode 100644
index 5d1fcd4..0000000
--- a/scripts/upcast_to_bf16.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python3
-"""
-Upcast a mixed-precision DeepSeek V4 Pro model to pure BF16.
-
-Converts all FP8 tensors (float8_e8m0fnu, float8_e4m3fn, float8_e5m2)
-to bfloat16 so that modelopt's PTQ calibration can run without hitting
-broken FP8 kernel paths (DeepGEMM doesn't support Blackwell, and the
-Triton finegrained-fp8 matmul has shape mismatches during quantization).
-
-Usage:
-    python3 upcast_to_bf16.py /path/to/DeepSeek-V4-Pro /path/to/DeepSeek-V4-Pro-BF16
-
-The output model will have the same shard structure, same config (with
-torch_dtype updated to bfloat16), and zero FP8 tensors.
-"""
-
-import os
-import glob
-import shutil
-import argparse
-
-from safetensors import safe_open
-from safetensors.torch import save_file
-import torch
-
-FP8_DTYPES = (torch.float8_e8m0fnu, torch.float8_e4m3fn, torch.float8_e5m2)
-
-
-def upcast_model(model_dir: str, out_dir: str):
-    os.makedirs(out_dir, exist_ok=True)
-
-    # Copy non-safetensor files (config, tokenizer, etc.)
-    for f in os.listdir(model_dir):
-        fp = os.path.join(model_dir, f)
-        if not f.endswith(".safetensors") and os.path.isfile(fp):
-            shutil.copy2(fp, os.path.join(out_dir, f))
-            print(f"Copied {f}")
-
-    # Convert safetensors shard by shard
-    safetensor_files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
-    total = len(safetensor_files)
-    fp8_count = 0
-
-    for i, f in enumerate(safetensor_files):
-        tensors = {}
-        with safe_open(f, framework="pt") as sf:
-            for key in sf.keys():
-                t = sf.get_tensor(key)
-                if t.dtype in FP8_DTYPES:
-                    t = t.to(torch.bfloat16)
-                    fp8_count += 1
-                tensors[key] = t
-
-        out_path = os.path.join(out_dir, os.path.basename(f))
-        save_file(tensors, out_path)
-        del tensors  # free memory
-        if (i + 1) % 10 == 0 or i == total - 1:
-            print(f"[{i + 1}/{total}] {os.path.basename(f)} (converted {fp8_count} FP8 tensors)")
-
-    print(f"\nDone! FP8->BF16 tensors: {fp8_count}")
-
-    # Verify: count remaining FP8 tensors
-    remaining_fp8 = 0
-    for f in sorted(glob.glob(os.path.join(out_dir, "*.safetensors"))):
-        with safe_open(f, framework="pt") as sf:
-            for key in sf.keys():
-                if sf.get_tensor(key).dtype in FP8_DTYPES:
-                    remaining_fp8 += 1
-    print(f"Verification: {remaining_fp8} FP8 tensors remaining (should be 0)")
-
-    out_size = sum(
-        os.path.getsize(os.path.join(out_dir, f))
-        for f in os.listdir(out_dir)
-        if f.endswith(".safetensors")
-    )
-    print(f"Output size: {out_size / 1e12:.2f} TB")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Upcast DeepSeek V4 Pro mixed-precision to BF16")
-    parser.add_argument("model_dir", help="Path to mixed-precision model")
-    parser.add_argument("out_dir", help="Path to write BF16 model")
-    args = parser.parse_args()
-    upcast_model(args.model_dir, args.out_dir)