[V1] V1 Enablement Oracle (#13726)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-03-15 01:02:20 -04:00
parent 8c0d15d5c5
commit d4d93db2c5
96 changed files with 1537 additions and 512 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -117,10 +117,10 @@ steps:
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
@@ -136,7 +136,7 @@ steps:
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  commands:
-  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
+  - python3 ../examples/offline_inference/data_parallel.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
@@ -197,16 +197,17 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
-    - VLLM_USE_V1=1 pytest -v -s v1/core
-    - VLLM_USE_V1=1 pytest -v -s v1/engine
-    - VLLM_USE_V1=1 pytest -v -s v1/sample
-    - VLLM_USE_V1=1 pytest -v -s v1/worker
-    - VLLM_USE_V1=1 pytest -v -s v1/structured_output
-    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
-    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/core
+    - pytest -v -s v1/engine
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_stats.py
+    - pytest -v -s v1/test_utils.py
+    - pytest -v -s v1/test_oracle.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-    - VLLM_USE_V1=1 pytest -v -s v1/e2e
+    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -226,12 +227,12 @@ steps:
    - python3 offline_inference/llm_engine_example.py
    - python3 offline_inference/vision_language.py
    - python3 offline_inference/vision_language_multi_image.py
-    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@@ -375,7 +376,8 @@ steps:
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_initialization.py
+    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@@ -518,8 +520,8 @@ steps:
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

 - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"