tests/models/language/pooling/test_token_classification.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from transformers import AutoModelForTokenClassification

from tests.models.utils import softmax
from vllm.platforms import current_platform


@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
# The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_bert_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check vLLM token classification against HF Transformers for BERT.

    Every example prompt is run through vLLM's ``token_classify`` and through
    an ``AutoModelForTokenClassification`` reference model. The softmax of the
    HF logits must match the corresponding vLLM output within tolerance.

    Args:
        hf_runner: Fixture yielding the HF Transformers reference runner.
        vllm_runner: Fixture yielding the vLLM runner.
        example_prompts: Fixture providing the prompts to classify.
        model: Hugging Face model identifier under test.
        dtype: Dtype string passed to both runners.
    """
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    # Use eager attention on ROCm to avoid HF Transformers flash attention
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    hf_model_kwargs = {}
    if current_platform.is_rocm():
        hf_model_kwargs["attn_implementation"] = "eager"

    with hf_runner(
        model,
        dtype=dtype,
        auto_cls=AutoModelForTokenClassification,
        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
        # One prompt per forward pass, so no padding is involved and
        # ``logits[0]`` is the single sequence in the batch.
        for prompt in example_prompts:
            inputs = tokenizer([prompt], return_tensors="pt")
            inputs = hf_model.wrap_device(inputs)
            output = hf_model.model(**inputs)
            hf_outputs.append(softmax(output.logits[0]))

    # zip() stops at the shorter input, so without this check a missing
    # output on either side would be skipped silently instead of failing.
    assert len(hf_outputs) == len(vllm_outputs), (
        f"Got {len(vllm_outputs)} vLLM outputs for {len(hf_outputs)} prompts"
    )

    # Compare the softmax-normalized HF logits with the vLLM outputs.
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = hf_output.detach().clone().cpu().float()
        vllm_output = vllm_output.detach().clone().cpu().float()
        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)


@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_modernbert_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check vLLM token classification against HF Transformers for ModernBERT.

    Every example prompt is run through vLLM's ``token_classify`` and through
    an ``AutoModelForTokenClassification`` reference model. The softmax of the
    HF logits must match the corresponding vLLM output within tolerance.

    Args:
        hf_runner: Fixture yielding the HF Transformers reference runner.
        vllm_runner: Fixture yielding the vLLM runner.
        example_prompts: Fixture providing the prompts to classify.
        model: Hugging Face model identifier under test.
        dtype: Dtype string passed to both runners.
    """
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    # Use eager attention on ROCm to avoid HF Transformers flash attention
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    hf_model_kwargs = {}
    if current_platform.is_rocm():
        hf_model_kwargs["attn_implementation"] = "eager"

    with hf_runner(
        model,
        dtype=dtype,
        auto_cls=AutoModelForTokenClassification,
        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
        # One prompt per forward pass, so no padding is involved and
        # ``logits[0]`` is the single sequence in the batch.
        for prompt in example_prompts:
            inputs = tokenizer([prompt], return_tensors="pt")
            inputs = hf_model.wrap_device(inputs)
            output = hf_model.model(**inputs)
            hf_outputs.append(softmax(output.logits[0]))

    # zip() stops at the shorter input, so without this check a missing
    # output on either side would be skipped silently instead of failing.
    assert len(hf_outputs) == len(vllm_outputs), (
        f"Got {len(vllm_outputs)} vLLM outputs for {len(hf_outputs)} prompts"
    )

    # Compare the softmax-normalized HF logits with the vLLM outputs.
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = hf_output.detach().clone().cpu().float()
        vllm_output = vllm_output.detach().clone().cpu().float()
        torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)


@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_auto_conversion(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check vLLM token classification against HF Transformers for this model.

    Every example prompt is run through vLLM's ``token_classify`` and through
    an ``AutoModelForTokenClassification`` reference model. The softmax of the
    HF logits must match the corresponding vLLM output within tolerance.

    Args:
        hf_runner: Fixture yielding the HF Transformers reference runner.
        vllm_runner: Fixture yielding the vLLM runner.
        example_prompts: Fixture providing the prompts to classify.
        model: Hugging Face model identifier under test.
        dtype: Dtype string passed to both runners.
    """
    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
        # One prompt per forward pass, so no padding is involved and
        # ``logits[0]`` is the single sequence in the batch.
        for prompt in example_prompts:
            inputs = tokenizer([prompt], return_tensors="pt")
            inputs = hf_model.wrap_device(inputs)
            output = hf_model.model(**inputs)
            hf_outputs.append(softmax(output.logits[0]))

    # zip() stops at the shorter input, so without this check a missing
    # output on either side would be skipped silently instead of failing.
    assert len(hf_outputs) == len(vllm_outputs), (
        f"Got {len(vllm_outputs)} vLLM outputs for {len(hf_outputs)} prompts"
    )

    # Compare the softmax-normalized HF logits with the vLLM outputs.
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = hf_output.detach().clone().cpu().float()
        vllm_output = vllm_output.detach().clone().cpu().float()
        # assert_close reports the mismatching elements on failure and rejects
        # shape mismatches instead of broadcasting. rtol=1e-5 is the
        # torch.allclose default, so the numeric tolerance is unchanged.
        torch.testing.assert_close(hf_output, vllm_output, atol=1e-2, rtol=1e-5)
[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`import pytest`
			`import torch`
			`from transformers import AutoModelForTokenClassification`

			`from tests.models.utils import softmax`
[ROCm][CI] Fix test_token_classification.py::test_bert_models (#31993) Signed-off-by: Divakar Verma <divakar.verma@amd.com> 2026-01-08 22:04:33 -06:00			`from vllm.platforms import current_platform`
[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00

			`@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])`
			`# The float32 is required for this tiny model to pass the test.`
			`@pytest.mark.parametrize("dtype", ["float"])`
			`@torch.inference_mode`
[Model] Add support for ModernBertForTokenClassification (#26340) Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Signed-off-by: antrec <antoine.recanati@gmail.com> Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 2025-10-07 16:29:19 +02:00			`def test_bert_models(`
			`hf_runner,`
			`vllm_runner,`
			`example_prompts,`
			`model: str,`
			`dtype: str,`
			`) -> None:`
			`with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:`
[Model][2/N] Improve all pooling task \| Support multi-vector retrieval (#25370) Signed-off-by: wang.yuqi <noooop@126.com> 2025-10-15 19:14:41 +08:00			`vllm_outputs = vllm_model.token_classify(example_prompts)`
[Model] Add support for ModernBertForTokenClassification (#26340) Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Signed-off-by: antrec <antoine.recanati@gmail.com> Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 2025-10-07 16:29:19 +02:00
[ROCm][CI] Fix test_token_classification.py::test_bert_models (#31993) Signed-off-by: Divakar Verma <divakar.verma@amd.com> 2026-01-08 22:04:33 -06:00			`# Use eager attention on ROCm to avoid HF Transformers flash attention`
			`# accuracy issues: https://github.com/vllm-project/vllm/issues/30167`
			`hf_model_kwargs = {}`
			`if current_platform.is_rocm():`
			`hf_model_kwargs["attn_implementation"] = "eager"`

[Model] Add support for ModernBertForTokenClassification (#26340) Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Signed-off-by: antrec <antoine.recanati@gmail.com> Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 2025-10-07 16:29:19 +02:00			`with hf_runner(`
[ROCm][CI] Fix test_token_classification.py::test_bert_models (#31993) Signed-off-by: Divakar Verma <divakar.verma@amd.com> 2026-01-08 22:04:33 -06:00			`model,`
			`dtype=dtype,`
			`auto_cls=AutoModelForTokenClassification,`
			`model_kwargs=hf_model_kwargs,`
[Model] Add support for ModernBertForTokenClassification (#26340) Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Signed-off-by: antrec <antoine.recanati@gmail.com> Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 2025-10-07 16:29:19 +02:00			`) as hf_model:`
			`tokenizer = hf_model.tokenizer`
			`hf_outputs = []`
			`for prompt in example_prompts:`
			`inputs = tokenizer([prompt], return_tensors="pt")`
			`inputs = hf_model.wrap_device(inputs)`
			`output = hf_model.model(**inputs)`
			`hf_outputs.append(softmax(output.logits[0]))`

			`# check logits difference`
			`for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):`
[ROCm][CI] Fix ModernBERT token classification test (#31612) Signed-off-by: Andreas Karatzas <akaratza@amd.com> 2026-01-01 22:19:08 -06:00			`hf_output = hf_output.detach().clone().cpu().float()`
			`vllm_output = vllm_output.detach().clone().cpu().float()`
[ROCm][CI] Fix test_token_classification.py::test_bert_models (#31993) Signed-off-by: Divakar Verma <divakar.verma@amd.com> 2026-01-08 22:04:33 -06:00			`torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)`
[Model] Add support for ModernBertForTokenClassification (#26340) Signed-off-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Signed-off-by: antrec <antoine.recanati@gmail.com> Co-authored-by: Antoine Recanati Le Goat <antoine.recanati@sancare.fr> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> 2025-10-07 16:29:19 +02:00

			`@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])`
			`@pytest.mark.parametrize("dtype", ["float"])`
			`@torch.inference_mode`
			`def test_modernbert_models(`
[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00			`hf_runner,`
			`vllm_runner,`
			`example_prompts,`
			`model: str,`
			`dtype: str,`
			`) -> None:`
			`with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:`
[Model][2/N] Improve all pooling task \| Support multi-vector retrieval (#25370) Signed-off-by: wang.yuqi <noooop@126.com> 2025-10-15 19:14:41 +08:00			`vllm_outputs = vllm_model.token_classify(example_prompts)`
[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00
[ROCm][CI] Fix ModernBERT token classification test (#31612) Signed-off-by: Andreas Karatzas <akaratza@amd.com> 2026-01-01 22:19:08 -06:00			`# Use eager attention on ROCm to avoid HF Transformers flash attention`
			`# accuracy issues: https://github.com/vllm-project/vllm/issues/30167`
			`hf_model_kwargs = {}`
			`if current_platform.is_rocm():`
			`hf_model_kwargs["attn_implementation"] = "eager"`

[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00			`with hf_runner(`
[ROCm][CI] Fix ModernBERT token classification test (#31612) Signed-off-by: Andreas Karatzas <akaratza@amd.com> 2026-01-01 22:19:08 -06:00			`model,`
			`dtype=dtype,`
			`auto_cls=AutoModelForTokenClassification,`
			`model_kwargs=hf_model_kwargs,`
[New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi <noooop@126.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2025-09-18 23:22:01 +08:00			`) as hf_model:`
			`tokenizer = hf_model.tokenizer`
			`hf_outputs = []`
			`for prompt in example_prompts:`
			`inputs = tokenizer([prompt], return_tensors="pt")`
			`inputs = hf_model.wrap_device(inputs)`
			`output = hf_model.model(**inputs)`
			`hf_outputs.append(softmax(output.logits[0]))`

			`# check logits difference`
			`for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):`
[ROCm][CI] Fix ModernBERT token classification test (#31612) Signed-off-by: Andreas Karatzas <akaratza@amd.com> 2026-01-01 22:19:08 -06:00			`hf_output = hf_output.detach().clone().cpu().float()`
			`vllm_output = vllm_output.detach().clone().cpu().float()`
[Bugfix][CI/Build] Fix failing pooling models test due to Triton kernel accuracy diff (#31776) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> 2026-01-06 16:44:22 +08:00			`torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)`
[Model] Automatic conversion of TokenClassification model (#30666) Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> 2025-12-15 16:13:00 +08:00

			`@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])`
			`@pytest.mark.parametrize("dtype", ["float"])`
			`@torch.inference_mode`
			`def test_auto_conversion(`
			`hf_runner,`
			`vllm_runner,`
			`example_prompts,`
			`model: str,`
			`dtype: str,`
			`) -> None:`
			`with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:`
			`vllm_outputs = vllm_model.token_classify(example_prompts)`

			`with hf_runner(`
			`model, dtype=dtype, auto_cls=AutoModelForTokenClassification`
			`) as hf_model:`
			`tokenizer = hf_model.tokenizer`
			`hf_outputs = []`
			`for prompt in example_prompts:`
			`inputs = tokenizer([prompt], return_tensors="pt")`
			`inputs = hf_model.wrap_device(inputs)`
			`output = hf_model.model(**inputs)`
			`hf_outputs.append(softmax(output.logits[0]))`

			`# check logits difference`
			`for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):`
[ROCm][CI] Fix ModernBERT token classification test (#31612) Signed-off-by: Andreas Karatzas <akaratza@amd.com> 2026-01-01 22:19:08 -06:00			`hf_output = hf_output.detach().clone().cpu().float()`
			`vllm_output = vllm_output.detach().clone().cpu().float()`
[Model] Automatic conversion of TokenClassification model (#30666) Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> 2025-12-15 16:13:00 +08:00			`assert torch.allclose(hf_output, vllm_output, atol=1e-2)`