[CORE] [QUANT] Support for GPTQModel's dynamic quantization per module override/control (#7086)

This commit is contained in:
Qubitium-ModelCloud
2025-02-13 01:19:43 +08:00
committed by GitHub
parent 2c2b560f48
commit 36a08630e8
8 changed files with 281 additions and 56 deletions

View File

@@ -108,9 +108,9 @@ class LogitsProcessor(nn.Module):
embedding_bias: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
# Get the logits for the next tokens.
-        logits = lm_head.linear_method.apply(lm_head,
-                                             hidden_states,
-                                             bias=embedding_bias)
+        logits = lm_head.quant_method.apply(lm_head,
+                                            hidden_states,
+                                            bias=embedding_bias)
# Gather logits for TP
logits = self._gather_logits(logits)