TileLang kernels (mhc_pre_big_fuse_tilelang, mhc_fused_tilelang) do not work correctly on Blackwell SM100 and cause empty model output. Replace them with pure PyTorch implementations:
- mhc_pre_torch: Sinkhorn-normalized HC residual mixing
- mhc_post_torch: HC post block (einsum residual + post layer mix)
- mhc_fused_post_pre_torch: fused post+pre (composition of the two above)
- hc_head_fused_torch: RMS norm + linear + sigmoid + weighted sum

Patch both layers/mhc.py (CustomOp dispatch) and kernels/mhc/__init__.py (no tilelang import). Also remove tilelang from the pyproject.toml dependencies.
16 lines
350 B
TOML
16 lines
350 B
TOML
# Build configuration: the package is built with the setuptools backend.
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

# Core package metadata.
[project]
name = "nvfp4-megamoe-kernel"
version = "0.1.0"
# NOTE(review): the description still mentions TileLang, but tilelang is not
# listed in `dependencies` below — confirm whether the wording is still accurate.
description = "NVFP4 Mega MoE kernel for DeepSeek-V4-Pro on Blackwell (TileLang)"
requires-python = ">=3.10"
# Runtime requirements, one requirement string per line.
dependencies = [
    "torch>=2.5",
]

# Package discovery: setuptools searches for packages under the `src` directory.
[tool.setuptools.packages.find]
where = ["src"]