fix: use NVFP4 SymmBuffer (2x SF size for group_size=16)
The NVFP4 mega_moe kernel needs a larger symmetric buffer: with group_size=16 it produces twice as many scale-factor (SF) entries as MXFP4, which uses group_size=32. This commit switches the buffer allocation from deep_gemm.get_symm_buffer_for_mega_moe to deep_gemm.mega.nvfp4.get_symm_buffer_for_nvfp4_mega_moe.
This commit is contained in:
@@ -761,7 +761,7 @@ class DeepseekV4MegaMoEExperts(nn.Module):
|
||||
return w_packed, scale_exp
|
||||
|
||||
def get_symm_buffer(self):
|
||||
import vllm.third_party.deep_gemm as deep_gemm
|
||||
from deep_gemm.mega import nvfp4 as nvfp4_mega
|
||||
|
||||
group = get_ep_group().device_group
|
||||
device = torch.accelerator.current_device_index()
|
||||
@@ -776,7 +776,8 @@ class DeepseekV4MegaMoEExperts(nn.Module):
|
||||
)
|
||||
symm_buffer = self._symm_buffer_cache.get(key)
|
||||
if symm_buffer is None:
|
||||
symm_buffer = deep_gemm.get_symm_buffer_for_mega_moe(
|
||||
# NVFP4 SymmBuffer: 2x SF size due to group_size=16 (vs MXFP4's 32)
|
||||
symm_buffer = nvfp4_mega.get_symm_buffer_for_nvfp4_mega_moe(
|
||||
group,
|
||||
self.num_experts,
|
||||
self.max_num_tokens,
|
||||
|
||||
Reference in New Issue
Block a user