[Bugfix][Quantization] Fix FP8 + EP (#13784)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
parent 51010a1807
commit 1e15aaef56
@@ -136,7 +136,7 @@ class AWQMarlinConfig(QuantizationConfig):
                     self.full_config).get_quant_method(layer, prefix)
             return AWQMarlinLinearMethod(self)
         elif isinstance(layer, FusedMoE):
-            if layer.num_experts > 32:
+            if layer.local_num_experts > 32:
                 # For MoEs with many experts the moe_wna16 kernel is faster
                 return MoeWNA16Config.from_config(
                     self.full_config).get_quant_method(layer, prefix)
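Note: the change gates the moe_wna16 fallback on layer.local_num_experts rather than the global layer.num_experts. With expert parallelism (EP), the experts are sharded across EP ranks, so each rank holds and executes only a fraction of the global count, and kernel selection should reflect what the rank actually runs. Below is a minimal, self-contained sketch of that distinction; MoELayer, ep_size, and pick_moe_kernel are illustrative stand-ins, not vLLM's actual classes or APIs.

    # Minimal sketch (not vLLM's FusedMoE) of why the kernel-selection
    # threshold must use the per-rank expert count under expert parallelism.
    from dataclasses import dataclass

    @dataclass
    class MoELayer:
        num_experts: int  # global expert count across all EP ranks (assumed name)
        ep_size: int      # number of expert-parallel ranks (assumed name)

        @property
        def local_num_experts(self) -> int:
            # Under EP, experts are sharded across ranks, so each rank
            # holds only num_experts / ep_size of them.
            return self.num_experts // self.ep_size

    def pick_moe_kernel(layer: MoELayer) -> str:
        # Mirrors the fixed check: decide based on the experts this
        # rank actually executes, not the global count.
        if layer.local_num_experts > 32:
            return "moe_wna16"  # faster when many experts per rank
        return "awq_moe"

    if __name__ == "__main__":
        # 64 global experts sharded over 4 EP ranks -> 16 experts per rank.
        layer = MoELayer(num_experts=64, ep_size=4)
        # The old check (num_experts > 32) would wrongly pick moe_wna16 here;
        # the fixed check sees only 16 local experts and keeps awq_moe.
        print(layer.local_num_experts, pick_moe_kernel(layer))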