[ROCm][CI] Fix ModernBERT token classification test (#31612)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Author: Andreas Karatzas
Date: 2026-01-01 22:19:08 -06:00 (committed by GitHub)
Parent: 5ac55eb30f
Commit: 013b54088c

@@ -34,8 +34,8 @@ def test_bert_models(
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
         assert torch.allclose(hf_output, vllm_output, 1e-2)
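
A note on the `torch.tensor(...)` → `detach().clone()` change: calling `torch.tensor()` on an existing tensor copy-constructs it, but recent PyTorch emits a UserWarning for that and silently detaches the result from autograd; `sourceTensor.detach().clone()` is the spelling PyTorch itself recommends. A standalone sketch of the difference (not repo code):

```python
import torch

logits = torch.randn(4, 8, requires_grad=True)

# Old pattern: works, but recent PyTorch warns
# "To copy construct from a tensor, it is recommended to use
#  sourceTensor.clone().detach() ... rather than torch.tensor(sourceTensor)".
old = torch.tensor(logits).cpu().float()

# New pattern: explicit detach + clone, no warning, same values.
new = logits.detach().clone().cpu().float()

assert torch.equal(old, new)
```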
@@ -49,11 +49,22 @@ def test_modernbert_models(
     model: str,
     dtype: str,
 ) -> None:
+    from vllm.platforms import current_platform
+
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
         vllm_outputs = vllm_model.token_classify(example_prompts)
 
+    # Use eager attention on ROCm to avoid HF Transformers flash attention
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    hf_model_kwargs = {}
+    if current_platform.is_rocm():
+        hf_model_kwargs["attn_implementation"] = "eager"
     with hf_runner(
-        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
+        model,
+        dtype=dtype,
+        auto_cls=AutoModelForTokenClassification,
+        model_kwargs=hf_model_kwargs,
     ) as hf_model:
         tokenizer = hf_model.tokenizer
         hf_outputs = []
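
The fix itself is just a platform-gated kwarg that `hf_runner` presumably forwards to Hugging Face's `from_pretrained()`. A minimal sketch of the same pattern outside the test harness (the checkpoint name is illustrative, and the assumption is that `model_kwargs` maps straight onto `from_pretrained` kwargs):

```python
import torch
from transformers import AutoModelForTokenClassification
from vllm.platforms import current_platform

# Illustrative checkpoint; any ModernBERT token-classification model works.
model_id = "answerdotai/ModernBERT-base"

# On ROCm, fall back to the eager (pure-PyTorch) attention path instead of
# flash attention, which has known accuracy issues there:
# https://github.com/vllm-project/vllm/issues/30167
kwargs = {}
if current_platform.is_rocm():
    kwargs["attn_implementation"] = "eager"

model = AutoModelForTokenClassification.from_pretrained(
    model_id, torch_dtype=torch.float16, **kwargs
)
```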
@@ -65,8 +76,8 @@ def test_modernbert_models(
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
         assert torch.allclose(hf_output, vllm_output, atol=1e-2)
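
One tolerance detail worth flagging: `torch.allclose(input, other, rtol=1e-05, atol=1e-08)` checks `|input - other| <= atol + rtol * |other|`, and a bare positional third argument binds to `rtol`. So the `test_bert_models` assert above uses a relative 1% tolerance, while the ModernBERT asserts use an absolute one, which behaves very differently for near-zero logits. A standalone illustration:

```python
import torch

# torch.allclose checks elementwise |a - b| <= atol + rtol * |b|.
a = torch.tensor([0.000, 1.000])
b = torch.tensor([0.005, 1.004])

assert torch.allclose(a, b, atol=1e-2)   # absolute 0.01 budget: passes

# A bare positional third argument is rtol, not atol, so allclose(a, b, 1e-2)
# is a *relative* 1% tolerance and rejects the near-zero element:
assert not torch.allclose(a, b, 1e-2)    # 0.005 > 1e-8 + 0.01 * 0.005
```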
@@ -96,6 +107,6 @@ def test_auto_conversion(
     # check logits difference
     for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
-        hf_output = torch.tensor(hf_output).cpu().float()
-        vllm_output = torch.tensor(vllm_output).cpu().float()
+        hf_output = hf_output.detach().clone().cpu().float()
+        vllm_output = vllm_output.detach().clone().cpu().float()
         assert torch.allclose(hf_output, vllm_output, atol=1e-2)