Enabled BnB NF4 inference on Gaudi (#20172)

Signed-off-by: Ruheena Suhani Shaik <rsshaik@habana.ai>
Author: Ruheena Suhani Shaik
Date: 2025-07-15 08:56:08 +05:30 (committed by GitHub)
Parent: 80305c1b24
Commit: 016b8d1b7f
2 changed files with 18 additions and 8 deletions
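For context, this change lives in vLLM's bitsandbytes in-flight quantization path. A rough usage sketch of how the NF4 path is typically driven from the Python API (the model name and engine arguments are illustrative only, may vary by vLLM version, and are not part of this commit):

```python
from vllm import LLM, SamplingParams

# In-flight bitsandbytes quantization; on Gaudi this commit keeps the
# quantized weights on the HPU device instead of calling .cuda().
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
    quantization="bitsandbytes",
    dtype="bfloat16",
)

outputs = llm.generate(["Hello, Gaudi!"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```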


@@ -199,6 +199,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         if self.pre_quant:
             if self.load_8bit:
+                if current_platform.is_hpu():
+                    raise ValueError(
+                        "currently hpu supports 4bit quantization only")
                 return self._quantized_8bit_generator(
                     hf_weights_files, use_safetensors,
                     quant_state_dict), quant_state_dict
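The `load_8bit` flag checked above is derived from the checkpoint's quantization config, so a pre-quantized 8-bit model now fails fast on HPU while 4-bit NF4 checkpoints proceed. A small sketch of the two configurations using Hugging Face's BitsAndBytesConfig (transformers API, shown for illustration only, not part of this diff):

```python
from transformers import BitsAndBytesConfig

# Rejected on HPU after this change: pre-quantized 8-bit weights.
int8_cfg = BitsAndBytesConfig(load_in_8bit=True)

# Supported path on HPU: 4-bit NF4 quantization.
nf4_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
)
```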
@@ -302,6 +306,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                         in temp_state_dict):
                 quant_state = _parse_quant_state(mapped_weight_name,
                                                  temp_state_dict)
+                if current_platform.is_hpu():
+                    assert quant_state.quant_type == "nf4", (
+                        "currently hpu supports nf4 quant_type only")
                 quant_state_dict[mapped_weight_name] = quant_state
                 yield org_weight_name, weight_tensor
             else:
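The `quant_state` asserted on above is bitsandbytes' per-tensor metadata, and its `quant_type` records whether the weights were packed as NF4 or FP4. A standalone illustration of where that field comes from (plain bitsandbytes, run on a CUDA device for simplicity; not vLLM code):

```python
import torch
import bitsandbytes.functional as F

weight = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

# quantize_4bit returns the packed tensor plus a QuantState whose
# quant_type names the 4-bit format.
packed, quant_state = F.quantize_4bit(weight, quant_type="nf4")
assert quant_state.quant_type == "nf4"  # passes the HPU-only assert above

packed_fp4, fp4_state = F.quantize_4bit(weight, quant_type="fp4")
print(fp4_state.quant_type)  # "fp4" -- would trip the assert on HPU
```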
@@ -372,10 +380,12 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                                   ...]
                 # bitsandbytes requires data in GPU
-                if weight_sub_tensor.is_cuda:
+                if (weight_sub_tensor.is_cuda
+                        or weight_sub_tensor.device.type == "hpu"):
                     loaded_weight = weight_sub_tensor
                 else:
-                    loaded_weight = weight_sub_tensor.cuda()
+                    loaded_weight = weight_sub_tensor.to(
+                        device=current_platform.device_type)
                 # remove the following after the issue is fixed:
                 # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
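The last hunk swaps a hard-coded `.cuda()` for a platform-derived target device, so the same loader path runs on both CUDA and HPU. A minimal sketch of the pattern, assuming `current_platform.device_type` resolves to a device string such as "cuda" or "hpu" as in vLLM's platform abstraction:

```python
import torch
from vllm.platforms import current_platform

def move_to_accelerator(t: torch.Tensor) -> torch.Tensor:
    # Already on an accelerator: hand the tensor back unchanged.
    if t.is_cuda or t.device.type == "hpu":
        return t
    # Otherwise move it to whatever device the active platform exposes
    # ("cuda", "hpu", ...) rather than assuming CUDA.
    return t.to(device=current_platform.device_type)
```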