2025-09-18 23:22:01 +08:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
2026-01-26 23:26:48 -06:00
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
2025-09-18 23:22:01 +08:00
|
|
|
import pytest
|
|
|
|
|
import torch
|
|
|
|
|
from transformers import AutoModelForTokenClassification
|
|
|
|
|
|
|
|
|
|
from tests.models.utils import softmax
|
2026-01-08 22:04:33 -06:00
|
|
|
from vllm.platforms import current_platform
|
2025-09-18 23:22:01 +08:00
|
|
|
|
|
|
|
|
|
2026-01-26 23:26:48 -06:00
|
|
|
@pytest.fixture(autouse=True)
def seed_everything():
    """Seed all random number generators for reproducibility.

    Applied automatically to every test in this module so HF/vLLM output
    comparisons are not perturbed by RNG state leaking between tests.
    """
    fixed_seed = 0
    # Seed the three RNG sources the tests may touch: stdlib, numpy, torch.
    random.seed(fixed_seed)
    np.random.seed(fixed_seed)
    torch.manual_seed(fixed_seed)
    if torch.cuda.is_available():
        # Also pin per-device CUDA RNGs and force deterministic cuDNN kernels.
        torch.cuda.manual_seed_all(fixed_seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    yield
|
|
|
|
|
|
|
|
|
|
|
2026-03-13 11:23:53 +08:00
|
|
|
@pytest.mark.parametrize(
    "model",
    [
        "boltuix/NeuroBERT-NER",
        "gyr66/Ernie-3.0-base-chinese-finetuned-ner",
    ],
)
# The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_bert_like_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check vLLM token-classification output against the HF reference
    for BERT-like NER models, prompt by prompt."""
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    # Use eager attention on ROCm to avoid HF Transformers flash attention
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    hf_model_kwargs = (
        {"attn_implementation": "eager"} if current_platform.is_rocm() else {}
    )

    with hf_runner(
        model,
        dtype=dtype,
        auto_cls=AutoModelForTokenClassification,
        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        # Per-prompt HF reference: tokenize, move to device, run the model,
        # and convert token logits to probabilities.
        hf_outputs = [
            softmax(
                hf_model.model(
                    **hf_model.wrap_device(tokenizer([prompt], return_tensors="pt"))
                ).logits[0]
            )
            for prompt in example_prompts
        ]

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_probs = hf_output.detach().clone().cpu().float()
        vllm_probs = vllm_output.detach().clone().cpu().float()
        torch.testing.assert_close(hf_probs, vllm_probs, atol=3.2e-2, rtol=1e-3)
|
2025-10-07 16:29:19 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode
def test_modernbert_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Check vLLM token-classification output against the HF reference
    for a ModernBERT NER checkpoint, prompt by prompt."""
    # NOTE: https://github.com/vllm-project/vllm/pull/32403
    # `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
    # model, which can cause numerical precision variance and edge cases.
    # We use @flaky(reruns=3) to mitigate intermittent failures.
    print(
        f"\n[NOTE] Testing {model} (randomly initialized weights) - "
        "flaky tolerance enabled due to numerical precision variance."
    )

    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    # Use eager attention on ROCm to avoid HF Transformers flash attention
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    hf_model_kwargs = (
        {"attn_implementation": "eager"} if current_platform.is_rocm() else {}
    )

    with hf_runner(
        model,
        dtype=dtype,
        auto_cls=AutoModelForTokenClassification,
        model_kwargs=hf_model_kwargs,
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        # Per-prompt HF reference: tokenize, move to device, run the model,
        # and convert token logits to probabilities.
        hf_outputs = [
            softmax(
                hf_model.model(
                    **hf_model.wrap_device(tokenizer([prompt], return_tensors="pt"))
                ).logits[0]
            )
            for prompt in example_prompts
        ]

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_probs = hf_output.detach().clone().cpu().float()
        vllm_probs = vllm_output.detach().clone().cpu().float()
        torch.testing.assert_close(hf_probs, vllm_probs, atol=3.2e-2, rtol=1e-3)
|
2025-12-15 16:13:00 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
def test_auto_conversion(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Verify that a causal-LM checkpoint auto-converted for token
    classification matches HF's AutoModelForTokenClassification output.
    """
    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
    ) as hf_model:
        tokenizer = hf_model.tokenizer
        hf_outputs = []
        for prompt in example_prompts:
            # Per-prompt HF reference: tokenize, move to device, run the
            # model, and convert token logits to probabilities.
            inputs = tokenizer([prompt], return_tensors="pt")
            inputs = hf_model.wrap_device(inputs)
            output = hf_model.model(**inputs)
            hf_outputs.append(softmax(output.logits[0]))

    # check logits difference
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = hf_output.detach().clone().cpu().float()
        vllm_output = vllm_output.detach().clone().cpu().float()
        # Use torch.testing.assert_close (consistent with the other tests in
        # this file) so a failure reports which elements mismatch instead of a
        # bare AssertionError; rtol=1e-5 matches torch.allclose's default, so
        # the effective tolerance is unchanged.
        torch.testing.assert_close(hf_output, vllm_output, atol=1e-2, rtol=1e-5)
|