# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Run the LM eval harness on a model and compare against an HF baseline
computed offline. Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import os
from contextlib import contextmanager

import lm_eval
import pytest
import yaml

from vllm.platforms import current_platform

DEFAULT_RTOL = 0.08


@contextmanager
def scoped_env_vars(new_env: dict[str, str] | None):
    """Temporarily set environment variables for the duration of the block."""
    if not new_env:
        # Fast path: nothing to do
        yield
        return

    old_values = {}
    new_keys = []
    try:
        for key, value in new_env.items():
            if key in os.environ:
                old_values[key] = os.environ[key]
            else:
                new_keys.append(key)
            os.environ[key] = str(value)
        yield
    finally:
        # Restore pre-existing values and drop any keys we introduced
        for key, value in old_values.items():
            os.environ[key] = value
        for key in new_keys:
            os.environ.pop(key, None)


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    enforce_eager = eval_config.get("enforce_eager", "true")
    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager={enforce_eager},"
        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
        "allow_deprecated_quantization=True,"
    )
    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
        model_args += "attention_backend=TRITON_ATTN"

    env_vars = eval_config.get("env_vars", None)
    with scoped_env_vars(env_vars):
        results = lm_eval.simple_evaluate(
            model=backend,
            model_args=model_args,
            tasks=[task["name"] for task in eval_config["tasks"]],
            num_fewshot=eval_config["num_fewshot"],
            limit=eval_config["limit"],
            # TODO(yeq): using a chat template w/ fewshot_as_multiturn is supposed
            # to help text models. However, it regresses the measured strict-match
            # for existing text models in CI, so only apply it for multimodal
            # backends or when explicitly set in the config.
            apply_chat_template=eval_config.get(
                "apply_chat_template", backend == "vllm-vlm"
            ),
            fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
            # Forward decoding and early-stop controls (e.g. max_gen_toks, until=...)
            gen_kwargs=eval_config.get("gen_kwargs"),
            batch_size=batch_size,
        )
    return results


def _check_rocm_gpu_arch_requirement(eval_config):
    """Skip the test if the model requires a ROCm GPU arch that is not present.

    Model YAML configs can specify::

        required_gpu_arch:
          - gfx942
          - gfx950

    The check only applies on ROCm. On other platforms (e.g. CUDA) the field
    is ignored so that shared config files work for both NVIDIA and AMD CI
    pipelines.
""" required_archs = eval_config.get("required_gpu_arch") if not required_archs: return if not current_platform.is_rocm(): return from vllm.platforms.rocm import _GCN_ARCH # noqa: E402 if not any(arch in _GCN_ARCH for arch in required_archs): pytest.skip( f"Model requires GPU arch {required_archs}, " f"but detected arch is '{_GCN_ARCH}'" ) def test_lm_eval_correctness_param(config_filename, tp_size): eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) _check_rocm_gpu_arch_requirement(eval_config) results = launch_lm_eval(eval_config, tp_size) rtol = eval_config.get("rtol", DEFAULT_RTOL) success = True for task in eval_config["tasks"]: for metric in task["metrics"]: ground_truth = metric["value"] measured_value = results["results"][task["name"]][metric["name"]] print( f"{task['name']} | {metric['name']}: " f"ground_truth={ground_truth:.3f} | " f"measured={measured_value:.3f} | rtol={rtol}" ) min_acceptable = ground_truth * (1 - rtol) success = success and measured_value >= min_acceptable assert success