diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py index 64d42432c..341df02e7 100644 --- a/tests/models/language/pooling/test_token_classification.py +++ b/tests/models/language/pooling/test_token_classification.py @@ -34,8 +34,8 @@ def test_bert_models( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() assert torch.allclose(hf_output, vllm_output, 1e-2) @@ -49,11 +49,22 @@ def test_modernbert_models( model: str, dtype: str, ) -> None: + from vllm.platforms import current_platform + with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.token_classify(example_prompts) + # Use eager attention on ROCm to avoid HF Transformers flash attention + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + hf_model_kwargs = {} + if current_platform.is_rocm(): + hf_model_kwargs["attn_implementation"] = "eager" + with hf_runner( - model, dtype=dtype, auto_cls=AutoModelForTokenClassification + model, + dtype=dtype, + auto_cls=AutoModelForTokenClassification, + model_kwargs=hf_model_kwargs, ) as hf_model: tokenizer = hf_model.tokenizer hf_outputs = [] @@ -65,8 +76,8 @@ def test_modernbert_models( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() assert torch.allclose(hf_output, vllm_output, atol=1e-2) @@ -96,6 +107,6 @@ def test_auto_conversion( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() assert torch.allclose(hf_output, vllm_output, atol=1e-2)