Compare commits


41 Commits

Author SHA1 Message Date
Simon Mo
9ba0817ff1 bump version to v0.6.1.post2 (#8473) 2024-09-13 11:35:00 -07:00
Nick Hill
18e9e1f7b3 [HotFix] Fix final output truncation with stop string + streaming (#8468) 2024-09-13 11:31:12 -07:00
Isotr0py
f57092c00b [Doc] Add oneDNN installation to CPU backend documentation (#8467) 2024-09-13 18:06:30 +00:00
Cyrus Leung
a84e598e21 [CI/Build] Reorganize models tests (#7820) 2024-09-13 10:20:06 -07:00
youkaichao
0a4806f0a9 [plugin][torch.compile] allow to add custom compile backend (#8445) 2024-09-13 09:32:42 -07:00
Cyrus Leung
ecd7a1d5b6 [Installation] Gate FastAPI version for Python 3.8 (#8456) 2024-09-13 09:02:26 -07:00
youkaichao
a2469127db [misc][ci] fix quant test (#8449) 2024-09-13 17:20:14 +08:00
Jee Jee Li
06311e2956 [Misc] Skip loading extra bias for Qwen2-VL GPTQ-Int8 (#8442) 2024-09-13 07:58:28 +00:00
youkaichao
cab69a15e4 [doc] recommend pip instead of conda (#8446) 2024-09-12 23:52:41 -07:00
Isotr0py
9b4a3b235e [CI/Build] Enable InternVL2 PP test only on single node (#8437) 2024-09-13 06:35:20 +00:00
Simon Mo
acda0b35d0 bump version to v0.6.1.post1 (#8440) 2024-09-12 21:39:49 -07:00
William Lin
ba77527955 [bugfix] torch profiler bug for single gpu with GPUExecutor (#8354) 2024-09-12 21:30:00 -07:00
Alexander Matveev
6821020109 [Bugfix] Fix async log stats (#8417) 2024-09-12 20:48:59 -07:00
Cyrus Leung
8427550488 [CI/Build] Update pixtral tests to use JSON (#8436) 2024-09-13 03:47:52 +00:00
Cyrus Leung
3f79bc3d1a [Bugfix] Bump fastapi and pydantic version (#8435) 2024-09-13 03:21:42 +00:00
shangmingc
40c396533d [Bugfix] Mapping physical device indices for e2e test utils (#8290) 2024-09-13 11:06:28 +08:00
Cyrus Leung
5ec9c0fb3c [Core] Factor out input preprocessing to a separate class (#7329) 2024-09-13 02:56:13 +00:00
Dipika Sikka
8f44a92d85 [BugFix] fix group_topk (#8430) 2024-09-13 09:23:42 +08:00
Roger Wang
360ddbd37e [Misc] Update Pixtral example (#8431) 2024-09-12 17:31:18 -07:00
Wenxiang
a480939e8e [Bugfix] Fix weight loading issue by rename variable. (#8293) 2024-09-12 19:25:00 -04:00
Patrick von Platen
d31174a4e1 [Hotfix][Pixtral] Fix multiple images bugs (#8415) 2024-09-12 15:21:51 -07:00
Roger Wang
b61bd98f90 [CI/Build] Disable multi-node test for InternVL2 (#8428) 2024-09-12 15:05:35 -07:00
Roger Wang
c16369455f [Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425) 2024-09-12 14:06:51 -07:00
Alexander Matveev
019877253b [Bugfix] multi-step + flashinfer: ensure cuda graph compatible (#8427) 2024-09-12 21:01:50 +00:00
Nick Hill
551ce01078 [Core] Add engine option to return only deltas or final output (#7381) 2024-09-12 12:02:00 -07:00
William Lin
a6c0f3658d [multi-step] add flashinfer backend (#7928) 2024-09-12 11:16:22 -07:00
Joe Runde
f2e263b801 [Bugfix] Offline mode fix (#8376)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-09-12 11:11:57 -07:00
Luis Vega
1f0c75afa9 [BugFix] Fix Duplicate Assignment in Hermes2ProToolParser (#8423) 2024-09-12 11:10:11 -07:00
WANGWEI
8a23e93302 [BugFix] lazy init _copy_stream to avoid torch init wrong gpu instance (#8403) 2024-09-12 10:47:42 -07:00
Alex Brooks
c6202daeed [Model] Support multiple images for qwen-vl (#8247)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-12 10:10:54 -07:00
Isotr0py
e56bf27741 [Bugfix] Fix InternVL2 inference with various num_patches (#8375)
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-09-12 10:10:35 -07:00
Roger Wang
520ca380ae [Hotfix][VLM] Fixing max position embeddings for Pixtral (#8399) 2024-09-12 09:28:37 -07:00
youkaichao
7de49aa86c [torch.compile] hide slicing under custom op for inductor (#8384) 2024-09-12 00:11:55 -07:00
Woosuk Kwon
42ffba11ad [Misc] Use RoPE cache for MRoPE (#8396) 2024-09-11 23:13:14 -07:00
Kevin Lin
295c4730a8 [Misc] Raise error when using encoder/decoder model with cpu backend (#8355) 2024-09-12 05:45:24 +00:00
Blueyo0
1bf2dd9df0 [Gemma2] add bitsandbytes support for Gemma2 (#8338) 2024-09-11 21:53:12 -07:00
tomeras91
5a60699c45 [Bugfix]: Fix the logic for deciding if tool parsing is used (#8366) 2024-09-12 03:55:30 +00:00
Michael Goin
b6c75e1cf2 Fix the AMD weight loading tests (#8390) 2024-09-11 20:35:33 -07:00
Woosuk Kwon
b71c956deb [TPU] Use Ray for default distributed backend (#8389) 2024-09-11 20:31:51 -07:00
youkaichao
f842a7aff1 [misc] remove engine_use_ray (#8126) 2024-09-11 18:23:36 -07:00
Cody Yu
a65cb16067 [MISC] Dump model runner inputs when crashing (#8305) 2024-09-12 01:12:25 +00:00
117 changed files with 2993 additions and 1833 deletions

View File

@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
-    --ignore=tests/models/test_oot_registration.py \
-    --ignore=tests/models/test_registry.py \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/test_jamba.py \
-    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models/decoder_only/language \
+    --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/decoder_only/language/test_jamba.py \
+    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 
 # Run compressed-tensor test
 docker exec cpu-test bash -c "

View File

@@ -50,6 +50,7 @@ steps:
   - tests/worker
   commands:
   - pytest -v -s async_engine # Async Engine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
@@ -91,7 +92,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
   - pytest -v -s entrypoints/test_chat_utils.py
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
@@ -162,15 +163,6 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: Models Test # 1hr10min
-  source_file_dependencies:
-  - vllm/
-  - tests/models
-  commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
-    - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
 
 - label: torch compile integration test
   source_file_dependencies:
   - vllm/
@@ -178,14 +170,6 @@ steps:
   - pytest -v -s ./compile/test_full_graph.py
   - pytest -v -s ./compile/test_wrapper.py
 
-- label: Vision Language Models Test # 42min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -284,6 +268,45 @@ steps:
   commands:
   - pytest -v -s tool_use
 
+##### models test #####
+- label: Basic Models Test # 3min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+    - pip install -e ./plugins/vllm_add_dummy_model
+    - pytest -v -s models/test_oot_registration.py # it needs a clean process
+    - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
+
+- label: Decoder-only Language Models Test # 1h3min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  commands:
+    - pytest -v -s models/decoder_only/language
+
+- label: Decoder-only Multi-Modal Models Test # 56min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pytest -v -s models/decoder_only/audio_language
+    - pytest -v -s models/decoder_only/vision_language
+
+- label: Other Models Test # 5min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+    - pytest -v -s models/embedding/language
+    - pytest -v -s models/encoder_decoder/language
+
 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -309,11 +332,11 @@ steps:
   - tests/distributed/
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
 - label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
@@ -326,11 +349,10 @@ steps:
   - vllm/model_executor/models/
   - tests/distributed/
   commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
-  - pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py

View File

@@ -30,6 +30,15 @@ body:
     </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug

View File

@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
 void gelu_quick(torch::Tensor& out, torch::Tensor& input);
 
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables);
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
 
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,

View File

@@ -12,13 +12,11 @@ namespace prepare_inputs {
 //
 template <int const num_threads>
-__global__ void advance_step_kernel(int num_seqs, int num_queries,
-                                    int block_size, long* input_tokens_ptr,
-                                    long const* sampled_token_ids_ptr,
-                                    long* input_positions_ptr,
-                                    int* seq_lens_ptr, long* slot_mapping_ptr,
-                                    int const* block_tables_ptr,
-                                    int64_t const block_tables_stride) {
+__global__ void advance_step_flashattn_kernel(
+    int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
+    long const* sampled_token_ids_ptr, long* input_positions_ptr,
+    int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
+    int64_t const block_tables_stride) {
   int num_query_blocks = div_ceil(num_queries, num_threads);
 
   if (blockIdx.x >= num_query_blocks) {
@@ -79,16 +77,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
   }
 }
 
-void advance_step(int num_seqs, int num_queries, int block_size,
-                  torch::Tensor& input_tokens,       // type: long
-                  torch::Tensor& sampled_token_ids,  // type: long
-                  torch::Tensor& input_positions,    // type: long
-                  torch::Tensor& seq_lens,           // type: int
-                  torch::Tensor& slot_mapping,       // type: long
-                  torch::Tensor& block_tables) {     // type: int
+__global__ void advance_step_flashinfer_kernel(
+    int num_threads, int num_seqs, int num_queries, int block_size,
+    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
+    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
+    int const* block_tables_ptr, int64_t const block_tables_stride,
+    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
+  int num_query_blocks = div_ceil(num_queries, num_threads);
+
+  if (blockIdx.x < num_query_blocks) {
+    int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
+
+    if (cur_query_id < num_queries) {
+      // Update input_tokens
+      input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
+
+      int seq_len = seq_lens_ptr[cur_query_id];
+      int next_seq_len = seq_len + 1;
+      int next_input_pos = next_seq_len - 1;
+
+      // Update seq_lens
+      seq_lens_ptr[cur_query_id] = next_seq_len;
+      // Update input_positions
+      input_positions_ptr[cur_query_id] = next_input_pos;
+
+      int const* seq_block_tables_ptr =
+          block_tables_ptr + block_tables_stride * cur_query_id;
+
+      int block_index = next_input_pos / block_size;
+      int block_offset = next_input_pos % block_size;
+
+      // Update paged_kv_last_page_len
+      paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
+
+      int slot_num =
+          seq_block_tables_ptr[block_index] * block_size + block_offset;
+      // Update slot_mapping
+      slot_mapping_ptr[cur_query_id] = slot_num;
+      block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
+    }
+  }
+}
+
+__global__ void advance_step_flashinfer_indptr_kernel(
+    int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
+    int* block_table_bound_ptr) {
+  int idx = blockIdx.x * num_threads + threadIdx.x;
+
+  // Update paged_kv_indptr
+  if (idx < num_queries) {
+    int sum = 0;
+    for (int i = 0; i <= idx; ++i) {
+      sum += block_table_bound_ptr[i];
+    }
+    paged_kv_indptr_ptr[idx + 1] = sum;
+  }
+}
+
+__global__ void advance_step_flashinfer_indices_kernel(
+    int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
+    int64_t const block_tables_stride, int* paged_kv_indices_ptr,
+    int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
+  int idx = blockIdx.x * num_threads + threadIdx.x;
+  int row = idx / block_tables_stride;
+  int col = idx % block_tables_stride;
+
+  if (row < num_queries && col < block_table_bound_ptr[row]) {
+    paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
+        block_tables_ptr[row * block_tables_stride + col];
+  }
+  // if cudagraph, fill padded seqs with the last valid seq's indptr
+  if (num_queries < row && row <= num_seqs) {
+    paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
+  }
+}
+
+void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
+                            torch::Tensor& input_tokens,       // type: long
+                            torch::Tensor& sampled_token_ids,  // type: long
+                            torch::Tensor& input_positions,    // type: long
+                            torch::Tensor& seq_lens,           // type: int
+                            torch::Tensor& slot_mapping,       // type: long
+                            torch::Tensor& block_tables) {     // type: int
   if (logging) {
-    printf("advance_step:\n");
+    printf("advance_step_flashattn:\n");
     printf("  num_seqs = %d\n", num_seqs);
     printf("  num_queries = %d\n", num_queries);
     printf("  block_size = %d\n", block_size);
@@ -108,24 +181,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
   int blocks;
   cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
 
-  advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
-      num_seqs, num_queries, block_size,
-      reinterpret_cast<long*>(input_tokens.data_ptr()),
-      reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
-      reinterpret_cast<long*>(input_positions.data_ptr()),
-      reinterpret_cast<int*>(seq_lens.data_ptr()),
-      reinterpret_cast<long*>(slot_mapping.data_ptr()),
-      reinterpret_cast<int const*>(block_tables.data_ptr()),
-      block_tables.stride(0));
+  advance_step_flashattn_kernel<max_threads>
+      <<<blocks, max_threads, 0, stream>>>(
+          num_seqs, num_queries, block_size,
+          reinterpret_cast<long*>(input_tokens.data_ptr()),
+          reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
+          reinterpret_cast<long*>(input_positions.data_ptr()),
+          reinterpret_cast<int*>(seq_lens.data_ptr()),
+          reinterpret_cast<long*>(slot_mapping.data_ptr()),
+          reinterpret_cast<int const*>(block_tables.data_ptr()),
+          block_tables.stride(0));
+}
+
+void advance_step_flashinfer(
+    int num_seqs, int num_queries, int block_size,
+    torch::Tensor& input_tokens,            // type: long
+    torch::Tensor& sampled_token_ids,       // type: long
+    torch::Tensor& input_positions,         // type: long
+    torch::Tensor& seq_lens,                // type: int
+    torch::Tensor& slot_mapping,            // type: long
+    torch::Tensor& block_tables,            // type: int
+    torch::Tensor& paged_kv_indices,        // type: int
+    torch::Tensor& paged_kv_indptr,         // type: int
+    torch::Tensor& paged_kv_last_page_len,  // type: int
+    torch::Tensor& block_table_bound) {     // type: int
+  if (logging) {
+    printf("advance_step_flashinfer:\n");
+    printf("  num_seqs = %d\n", num_seqs);
+    printf("  num_queries = %d\n", num_queries);
+    printf("  block_size = %d\n", block_size);
+    printf("  block_tables.stride(0) = %d\n", block_tables.stride(0));
+  }
+  // Verify all tensors
+  verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
+  // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
+  //               at::kLong);
+  verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
+  verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
+  verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
+  verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
+
+  verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
+  verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
+  verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
+                at::kInt);
+  verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
+
+  int dev = sampled_token_ids.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+
+  int blocks;
+  int threads;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
+  if (logging) {
+    printf("launching kernel with %d blocks\n", blocks);
+  }
+
+  // TODO(will): support arbitrary block_tables stride
+  if ((blocks * threads) / block_tables.stride(0) < num_queries) {
+    TORCH_CHECK(false,
+                "multi-step: not enough threads to map block_table to"
+                "FlashInfer's paged_kv_indices on GPU. Try reducing the number "
+                "of seqs,",
+                " increasing the block size or take smaller steps.",
+                " num_queries = ", num_queries,
+                " block_tables.stride(0) = ", block_tables.stride(0),
+                " blocks = ", blocks, " max_threads = ", threads);
+  }
+
+  advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries, block_size,
+      reinterpret_cast<long*>(input_tokens.data_ptr()),
+      reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
+      reinterpret_cast<long*>(input_positions.data_ptr()),
+      reinterpret_cast<int*>(seq_lens.data_ptr()),
+      reinterpret_cast<long*>(slot_mapping.data_ptr()),
+      reinterpret_cast<int const*>(block_tables.data_ptr()),
+      block_tables.stride(0),
+      reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
+
+  advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries,
+      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
+
+  advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
+      threads, num_seqs, num_queries,
+      reinterpret_cast<int const*>(block_tables.data_ptr()),
+      block_tables.stride(0),
+      reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
+      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
+      reinterpret_cast<int*>(block_table_bound.data_ptr()));
 }
 
 }  // namespace prepare_inputs
 
-void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
-                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
-                  torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
-  prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
-                               sampled_token_ids, input_positions, seq_lens,
-                               slot_mapping, block_tables);
+void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
+                            int64_t block_size, torch::Tensor& input_tokens,
+                            torch::Tensor& sampled_token_ids,
+                            torch::Tensor& input_positions,
+                            torch::Tensor& seq_lens,
+                            torch::Tensor& slot_mapping,
+                            torch::Tensor& block_tables) {
+  prepare_inputs::advance_step_flashattn(
+      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+      input_positions, seq_lens, slot_mapping, block_tables);
+}
+
+void advance_step_flashinfer(
+    int64_t num_seqs, int64_t num_queries, int64_t block_size,
+    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
+    torch::Tensor& input_positions, torch::Tensor& seq_lens,
+    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
+    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
+    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
+  prepare_inputs::advance_step_flashinfer(
+      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+      input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
+      paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
 }
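
For orientation: the two helper kernels above build a CSR-style layout for FlashInfer. block_table_bound holds the number of KV-cache blocks each query currently uses, paged_kv_indptr is that count's running sum with a leading zero, and paged_kv_indices packs each row's block IDs contiguously. A minimal NumPy sketch of the same computation (illustrative only; the array names mirror the kernel arguments and the values are made up):

import numpy as np

# Per-query count of KV-cache blocks in use (what block_table_bound holds).
block_table_bound = np.array([3, 1, 2])

# CSR row pointers: a leading zero, then the running sum of the counts.
paged_kv_indptr = np.concatenate(([0], np.cumsum(block_table_bound)))
# -> [0, 3, 4, 6]

# Pack each query's block IDs contiguously, as the indices kernel does.
block_tables = np.array([[7, 8, 9, 0],
                         [4, 0, 0, 0],
                         [5, 6, 0, 0]])  # zero-padded rows, stride(0) = 4
paged_kv_indices = np.concatenate(
    [block_tables[i, :n] for i, n in enumerate(block_table_bound)])
# -> [7, 8, 9, 4, 5, 6]; query i's blocks live in
#    paged_kv_indices[paged_kv_indptr[i]:paged_kv_indptr[i + 1]]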

View File

@@ -74,11 +74,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // prepare_inputs advance_step
   ops.def(
-      "advance_step(int num_seqs, int num_queries, int block_size, "
+      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
       "Tensor! input_tokens, Tensor sampled_token_ids, "
       "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
       "Tensor block_tables) -> ()");
-  ops.impl("advance_step", torch::kCUDA, &advance_step);
+  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
+
+  ops.def(
+      "advance_step_flashinfer("
+      "    int num_seqs, int num_queries, int block_size,"
+      "    Tensor! input_tokens, Tensor sampled_token_ids,"
+      "    Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
+      "    Tensor block_tables, Tensor! paged_kv_indices,"
+      "    Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
+      "    Tensor! block_table_bounds"
+      ") -> ()");
+  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
 
   // Layernorm
   // Apply Root Mean Square (RMS) Normalization to the input tensor.
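
With the bindings above, Python callers reach the kernels through the registered op names. A minimal sketch of a call-site wrapper, assuming the extension is registered under torch.ops._C (the namespace vLLM's _custom_ops module conventionally uses); the schema's Tensor! arguments are mutated in place:

import torch

def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
                           input_tokens: torch.Tensor,
                           sampled_token_ids: torch.Tensor,
                           input_positions: torch.Tensor,
                           seq_lens: torch.Tensor,
                           slot_mapping: torch.Tensor,
                           block_tables: torch.Tensor) -> None:
    # Advances the multi-step decode state in place on the GPU, so the next
    # step's inputs never round-trip through the CPU.
    torch.ops._C.advance_step_flashattn(num_seqs, num_queries, block_size,
                                        input_tokens, sampled_token_ids,
                                        input_positions, seq_lens,
                                        slot_mapping, block_tables)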

View File

@@ -59,6 +59,20 @@ Build from source
    $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
    $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
+- Third, build and install oneDNN library from source:
+
+.. code-block:: console
+
+   $ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
+   $ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
+       -DONEDNN_BUILD_DOC=OFF \
+       -DONEDNN_BUILD_EXAMPLES=OFF \
+       -DONEDNN_BUILD_TESTS=OFF \
+       -DONEDNN_BUILD_GRAPH=OFF \
+       -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
+       -DONEDNN_ENABLE_PRIMITIVE=MATMUL
+   $ cmake --build ./oneDNN/build --target install --config Release
+
 - Finally, build and install vLLM CPU backend:
 
 .. code-block:: console
View File

@@ -26,6 +26,10 @@ You can install vLLM using pip:
     $ # Install vLLM with CUDA 12.1.
     $ pip install vllm
 
+.. note::
+
+    Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.
+
 .. note::
 
     As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
@@ -34,7 +38,7 @@ You can install vLLM using pip:
     .. code-block:: console
 
         $ # Install vLLM with CUDA 11.8.
-        $ export VLLM_VERSION=0.4.0
+        $ export VLLM_VERSION=0.6.1.post1
         $ export PYTHON_VERSION=310
         $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
@@ -48,7 +52,7 @@ You can install vLLM using pip:
     .. code-block:: console
 
-        $ export VLLM_VERSION=0.5.4 # vLLM's main branch version is currently set to latest released tag
+        $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag
         $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
         $ # You can also access a specific commit
         $ # export VLLM_COMMIT=...
@@ -80,11 +84,11 @@ You can also build and install vLLM from source:
 
 .. tip::
-    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
+    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
 
 .. tip::
     To avoid your system being overloaded, you can limit the number of compilation jobs
-    to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
+    to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:
 
     .. code-block:: console
@@ -99,7 +103,7 @@ You can also build and install vLLM from source:
         $ # Use `--ipc=host` to make sure the shared memory is large enough.
         $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
 
-    If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
+    If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:
 
     .. code-block:: console

View File

@@ -254,7 +254,7 @@ Multimodal Language Models
     -
   * - :code:`QWenLMHeadModel`
     - Qwen-VL
-    - Image\ :sup:`E`
+    - Image\ :sup:`E+`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     -
   * - :code:`Qwen2VLForConditionalGeneration`
@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore
 We have the following levels of testing for models:
 
-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.

View File

@@ -11,7 +11,7 @@ from vllm.sampling_params import SamplingParams
 # - Server:
 #
 # ```bash
-# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
+# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@@ -45,6 +45,7 @@ def run_simple_demo():
     model_name = "mistralai/Pixtral-12B-2409"
     sampling_params = SamplingParams(max_tokens=8192)
 
+    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
     llm = LLM(model=model_name, tokenizer_mode="mistral")
 
     prompt = "Describe this image in one sentence."
@@ -83,7 +84,7 @@ def run_advanced_demo():
         model=model_name,
         tokenizer_mode="mistral",
         limit_mm_per_prompt={"image": max_img_per_msg},
-        max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
+        max_model_len=max_img_per_msg * max_tokens_per_img,
     )
 
     prompt = "Describe the following image."

View File

@@ -19,7 +19,39 @@ IMAGE_URLS = [
 ]
 
 
-def load_phi3v(question, image_urls: List[str]):
+def load_qwenvl_chat(question: str, image_urls: List[str]):
+    model_name = "Qwen/Qwen-VL-Chat"
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+    placeholders = "".join(f"Picture {i}: <img></img>\n"
+                           for i, _ in enumerate(image_urls, start=1))
+
+    # This model does not have a chat_template attribute on its tokenizer,
+    # so we need to explicitly pass it. We use ChatML since it's used in the
+    # generation utils of the model:
+    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+
+    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
+    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
+
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True,
+                                           chat_template=chat_template)
+
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids, None, chat_template
+
+
+def load_phi3v(question: str, image_urls: List[str]):
     llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
@@ -30,10 +62,10 @@ def load_phi3v(question, image_urls: List[str]):
                            for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
     stop_token_ids = None
-    return llm, prompt, stop_token_ids, None
+    return llm, prompt, stop_token_ids, None, None
 
 
-def load_internvl(question, image_urls: List[str]):
+def load_internvl(question: str, image_urls: List[str]):
     model_name = "OpenGVLab/InternVL2-2B"
 
     llm = LLM(
@@ -61,7 +93,7 @@ def load_internvl(question, image_urls: List[str]):
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids, None
+    return llm, prompt, stop_token_ids, None, None
 
 
 def load_qwen2_vl(question, image_urls: List[str]):
@@ -111,18 +143,19 @@ def load_qwen2_vl(question, image_urls: List[str]):
     else:
         image_data, _ = process_vision_info(messages)
-    return llm, prompt, stop_token_ids, image_data
+    return llm, prompt, stop_token_ids, image_data, None
 
 
 model_example_map = {
     "phi3_v": load_phi3v,
     "internvl_chat": load_internvl,
     "qwen2_vl": load_qwen2_vl,
+    "qwen_vl_chat": load_qwenvl_chat,
 }
 
 
 def run_generate(model, question: str, image_urls: List[str]):
-    llm, prompt, stop_token_ids, image_data = model_example_map[model](
-        question, image_urls)
+    llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
+        question, image_urls)
     if image_data is None:
         image_data = [fetch_image(url) for url in image_urls]
@@ -146,29 +179,32 @@ def run_generate(model, question: str, image_urls: List[str]):
 
 def run_chat(model: str, question: str, image_urls: List[str]):
-    llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
+    llm, _, stop_token_ids, _, chat_template = model_example_map[model](
+        question, image_urls)
     sampling_params = SamplingParams(temperature=0.0,
                                      max_tokens=128,
                                      stop_token_ids=stop_token_ids)
-    outputs = llm.chat([{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": question,
-            },
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                },
-            } for image_url in image_urls),
-        ],
-    }],
-                       sampling_params=sampling_params)
+    outputs = llm.chat(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": question,
+                },
+                *({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                } for image_url in image_urls),
+            ],
+        }],
+        sampling_params=sampling_params,
+        chat_template=chat_template,
+    )
 
     for o in outputs:
         generated_text = o.outputs[0].text

View File

@@ -16,7 +16,7 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
 
 llm.start_profile()
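
A usage sketch for the profiler example above: recent vLLM versions gate start_profile()/stop_profile() on a trace-directory environment variable, assumed here to be VLLM_TORCH_PROFILER_DIR:

import os

# Assumption: must be set before the LLM (and its workers) is created;
# torch profiler traces are written into this directory.
os.environ.setdefault("VLLM_TORCH_PROFILER_DIR", "/tmp/vllm_profile")

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
llm.start_profile()
llm.generate(["Hello, my name is"],
             SamplingParams(temperature=0.8, top_p=0.95))
llm.stop_profile()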

View File

@@ -76,7 +76,7 @@ exclude = [
 [tool.codespell]
 ignore-words-list = "dout, te, indicies, subtile"
-skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
+skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
 
 [tool.isort]
 use_parentheses = true
@@ -85,5 +85,6 @@ skip_gitignore = true
 
 [tool.pytest.ini_options]
 markers = [
     "skip_global_cleanup",
-    "vlm: run tests for vision language models only",
+    "core_model: run this model test in each PR instead of just daily",
+    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
 ]
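
The new markers pair with the pipeline changes above, which select tests via pytest ... -m distributed_2_gpus. A hypothetical sketch of how a test module would opt in (the test names here are invented for illustration):

import pytest

@pytest.mark.core_model
def test_small_model_smoke():
    ...  # runs in every PR instead of just the daily job

@pytest.mark.distributed_2_gpus
def test_two_gpu_correctness():
    ...  # collected only by the 2-GPU distributed CI step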

View File

@@ -7,11 +7,12 @@ py-cpuinfo
 transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfix.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
-fastapi
+fastapi < 0.113.0; python_version < '3.9'
+fastapi >= 0.114.1; python_version >= '3.9'
 aiohttp
 openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
 uvicorn[standard]
-pydantic >= 2.8  # Required for OpenAI server.
+pydantic >= 2.9  # Required for fastapi >= 0.113.0
 pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0

View File

@@ -1,4 +1,3 @@
-import os
 import subprocess
 import sys
 import time
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
-               worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
         str(tokenizer_pool_size)
     ]
 
-    # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
-    # to prevent `--engine-use-ray` raises an exception due to it deprecation
-    env_vars = os.environ.copy()
-    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    if engine_use_ray:
-        commands.append("--engine-use-ray")
-
     if worker_use_ray:
         commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands, env=env_vars)
+    uvicorn_process = subprocess.Popen(commands)
     yield
    uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
 @pytest.mark.parametrize("worker_use_ray", [False, True])
-@pytest.mark.parametrize("engine_use_ray", [False, True])
-def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
-                    engine_use_ray: bool):
+def test_api_server(api_server, tokenizer_pool_size: int,
+                    worker_use_ray: bool):
     """
     Run the API server and test it.
View File

@@ -1,8 +1,10 @@
 import asyncio
 import os
+import uuid
 from asyncio import CancelledError
+from copy import copy
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional
 
 import pytest
 import pytest_asyncio
@@ -12,6 +14,7 @@ from vllm import SamplingParams
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
+from vllm.sampling_params import RequestOutputKind
 
 from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -72,14 +75,12 @@ class MockEngine:
 
 class MockAsyncLLMEngine(AsyncLLMEngine):
-
-    def _init_engine(self, *args, **kwargs):
-        return MockEngine()
+    _engine_class = MockEngine
 
 
 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
+    engine = MockAsyncLLMEngine(worker_use_ray=False)
     engine.start_background_loop()
     await asyncio.sleep(0.01)
     assert engine.engine.step_calls == 0
@@ -112,16 +113,11 @@ async def test_new_requests_event():
     assert engine.engine.add_request_calls == 3
     assert engine.engine.step_calls == old_step_calls + 1
 
-    # Allow deprecated engine_use_ray to not raise exception
-    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
-
-    engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    engine = MockAsyncLLMEngine(worker_use_ray=True)
     assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
     assert engine.get_decoding_config() is not None
-    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
 
 
 def start_engine():
     wait_for_gpu_memory_to_clear(
@@ -130,8 +126,17 @@ def start_engine():
         timeout_s=60,
     )
 
+    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
+    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
+
     return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
+        AsyncEngineArgs(model="facebook/opt-125m",
+                        enforce_eager=True,
+                        num_scheduler_steps=num_scheduler_steps))
+
+
+def uid() -> str:
+    return str(uuid.uuid4())
 
 
 @pytest_asyncio.fixture(scope="module")
@@ -154,59 +159,195 @@ def should_do_global_cleanup_after_test(request) -> bool:
 
 @pytest.mark.asyncio(scope="module")
-async def test_asyncio_run(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_asyncio_run(async_engine, stop):
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
 
     async def run(prompt: str):
         sampling_params = SamplingParams(
             temperature=0,
             max_tokens=32,
+            min_tokens=32,
+            stop=stop,
         )
 
+        output_count = 0
+        final_output = None
         async for output in async_engine.generate(prompt,
                                                   sampling_params,
-                                                  request_id=prompt):
+                                                  request_id=uid()):
+            output_count += 1
             final_output = output
-        return final_output
+        return final_output, output_count
 
     results = await asyncio.gather(
         run("test0"),
-        run("test1"),
+        run("test0"),
     )
     assert len(results) == 2
+    first, second = results
+
+    # remove nondeterministic fields for comparison
+    first[0].metrics = None
+    second[0].metrics = None
+    first[0].request_id = None
+    second[0].request_id = None
+
+    assert str(first) == str(second)
+
+    output_count = results[0][1]
+    if num_scheduler_steps == 1:
+        assert output_count == 32
+    else:
+        assert 1 < output_count < 32
 
 
 @pytest.mark.asyncio(scope="module")
-async def test_cancellation(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_output_kinds(async_engine, stop):
+    """Test that output_kind works as expected and that
+    results are equivalent across different kinds."""
+
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
     sampling_params = SamplingParams(
         temperature=0,
-        min_tokens=10,
-        max_tokens=10,
+        max_tokens=32,
+        min_tokens=32,
+        stop=stop,
     )
+
+    async def run(prompt: str, kind: RequestOutputKind):
+        params = copy(sampling_params)
+        params.output_kind = kind
+
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            output_count += 1
+            final_output = output
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return (final_output.prompt_token_ids,
+                final_output.outputs[0].token_ids,
+                final_output.outputs[0].text, output_count)
+
+    async def run_deltas(prompt: str):
+        params = copy(sampling_params)
+        params.output_kind = RequestOutputKind.DELTA
+
+        prompt_tokens = None
+        output_tokens: List[int] = []
+        output_text = ""
+        output_count = 0
+        final_output = None
+        async for output in async_engine.generate(prompt,
+                                                  params,
+                                                  request_id=uid()):
+            token_ids = output.outputs[0].token_ids
+            text = output.outputs[0].text
+            final_output = output
+
+            # Ensure we get prompt ids iff we haven't yet received output tokens
+            if output_tokens:
+                assert 1 <= len(token_ids) <= num_scheduler_steps
+                assert stop or text
+                assert not output.prompt_token_ids
+            else:
+                assert output.prompt_token_ids
+                prompt_tokens = output.prompt_token_ids
+
+            output_tokens.extend(token_ids)
+            output_text += text
+
+            output_count += 1
+
+        assert final_output is not None
+        assert final_output.finished
+
+        return prompt_tokens, output_tokens, output_text, output_count
+
+    results = await asyncio.gather(
+        run("common input prompt", RequestOutputKind.CUMULATIVE),
+        run("common input prompt", RequestOutputKind.FINAL_ONLY),
+        run_deltas("common input prompt"))
+
+    # Make sure outputs are the same
+    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
+    assert len(prompt_set) == 1
+
+    text_set = set(text for _, _, text, _ in results)
+    assert len(text_set) == 1
+
+    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
+    assert len(tokens_set) == 1
+
+    cumulative, final, deltas = results
+
+    # output message counts
+    assert cumulative[3] == deltas[3]
+
+    if num_scheduler_steps == 1:
+        assert cumulative[3] == 32
+    else:
+        assert 1 < cumulative[3] < 32
+
+    assert final[3] == 1
+
+
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_cancellation(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+    num_scheduler_steps = scheduler_config.num_scheduler_steps
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=13,
+        max_tokens=13,
+        stop=stop,
+    )
+
+    stop_at = 5 if num_scheduler_steps == 1 else 1
+
+    request_id = uid()
 
     i = 0
     with pytest.raises(CancelledError):
         async for output in async_engine.generate("test2",
                                                   sampling_params,
-                                                  request_id="test2"):
+                                                  request_id=request_id):
             assert not output.finished
             i += 1
-            if i == 5:
-                await async_engine.abort("test2")
+            if i == stop_at:
+                await async_engine.abort(request_id)
 
-    assert i == 5
+    assert i == stop_at
 
 
 @pytest.mark.asyncio(scope="module")
-async def test_delayed_generator(async_engine):
+@pytest.mark.parametrize("stop", [None, ["a stop string"]])
+async def test_delayed_generator(async_engine, stop):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
     sampling_params = SamplingParams(
         temperature=0,
         min_tokens=10,
         max_tokens=10,
+        stop=stop,
    )
 
-    stream = async_engine.generate("test3",
-                                   sampling_params,
-                                   request_id="test3")
+    stream = async_engine.generate("test3", sampling_params, request_id=uid())
 
     i = 0
     final_output: Optional[RealRequestOutput] = None
     async for output in stream:
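
For context on the engine option from #7381 that this test exercises: a minimal sketch of requesting delta or final-only streaming via SamplingParams, assuming the RequestOutputKind variants used above (CUMULATIVE, DELTA, FINAL_ONLY) are the public surface:

from vllm.sampling_params import RequestOutputKind, SamplingParams

# CUMULATIVE (default): every streamed RequestOutput carries all text so far.
# DELTA: each RequestOutput carries only tokens/text new since the last one.
# FINAL_ONLY: a single RequestOutput is emitted when the request finishes.
params = SamplingParams(temperature=0,
                        max_tokens=32,
                        output_kind=RequestOutputKind.DELTA)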

View File

@@ -19,16 +19,11 @@ def server():
"--max-model-len", "--max-model-len",
"2048", "2048",
"--enforce-eager", "--enforce-eager",
"--engine-use-ray",
"--chat-template", "--chat-template",
str(chatml_jinja_path), str(chatml_jinja_path),
] ]
# Allow `--engine-use-ray`, otherwise the launch of the server throw with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
# an error due to try to use a deprecated feature
env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
with RemoteOpenAIServer(MODEL_NAME, args,
env_dict=env_dict) as remote_server:
yield remote_server yield remote_server

View File

@@ -3,20 +3,27 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import os
+import pickle
+import re
import weakref
+from unittest.mock import patch

import pytest

from vllm import LLM
from vllm.utils import is_hip
+from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

from ..models.utils import check_outputs_equal
+from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]

+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")

def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
@@ -64,3 +71,88 @@ def test_models(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
) -> None:
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
dtype = "half"
max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
def test_model_with_failure(vllm_runner) -> None:
try:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
side_effect=ValueError()):
with pytest.raises(ValueError) as exc_info:
vllm_runner("facebook/opt-125m",
dtype="half",
enforce_eager=False,
gpu_memory_utilization=0.7)
matches = re.search(r"input dumped to (.+).pkl",
str(exc_info.value))
assert matches is not None
filename = f"{matches.group(1)}.pkl"
with open(filename, "rb") as filep:
inputs = pickle.load(filep)
if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
f"{list(inputs.keys())}")
assert isinstance(inputs["arg_1"],
ModelInputForGPUWithSamplingMetadata)
finally:
os.remove(filename)

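The dump asserted on in `test_model_with_failure` can be inspected offline with plain `pickle`; a minimal sketch, with a hypothetical path (the real one appears in the raised error message as `input dumped to <path>.pkl`):

```python
import pickle

# Unpickling requires vllm to be importable, since the dump contains vllm
# types such as ModelInputForGPUWithSamplingMetadata.
with open("/tmp/err_execute_model_input.pkl", "rb") as f:  # hypothetical path
    dumped = pickle.load(f)

# Per the test above, positional args are stored under "arg_1", "arg_2", ...
for key, value in dumped.items():
    print(key, type(value))
```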
View File

@@ -6,11 +6,13 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
+import os
from contextlib import nullcontext

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
@@ -66,6 +68,59 @@ def test_models(
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
) -> None:
if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",

View File

@@ -19,10 +19,13 @@ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
] ]
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. " @pytest.fixture(scope="module", autouse=True)
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest " def check_settings():
"tests/basic_correctness/test_preemption.py`") assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")
@pytest.fixture @pytest.fixture
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray, worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt

View File

@@ -16,5 +16,7 @@ def test_full_graph(model):
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B") llm = LLM(model="meta-llama/Meta-Llama-3-8B",
enforce_eager=True,
load_format="dummy")
llm.generate(prompts, sampling_params) llm.generate(prompts, sampling_params)
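`load_format="dummy"` fills the weights with random values instead of downloading a checkpoint, so the full-graph test only pays for graph capture. A sketch of the same trick for quick smoke tests, reusing the entrypoint shown in this diff:

```python
from vllm import LLM, SamplingParams

# Dummy weights keep shapes and structure real while the numbers are not,
# so use this only where output quality is not checked.
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
          enforce_eager=True,
          load_format="dummy")
llm.generate(["The future of AI is"], SamplingParams(temperature=0))
```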

View File

@@ -6,8 +6,8 @@ import sys
import tempfile
from collections import UserList
from enum import Enum
-from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
+                    TypedDict, TypeVar, Union)

import numpy as np
import pytest
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
                          BatchFeature)
+from transformers.models.auto.auto_factory import _BaseAutoModelClass

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ class HfRunner:
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
        is_embedding_model: bool = False,
-        auto_cls=AutoModelForCausalLM,
+        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[[BatchEncoding],
                                     BatchEncoding] = identity,
    ) -> None:
@@ -292,20 +293,14 @@ class HfRunner:
            trust_remote_code=True,
        )

-        try:
-            # don't put this import at the top level
-            # it will call torch.cuda.device_count()
-            from transformers import AutoProcessor  # noqa: F401
-            self.processor = AutoProcessor.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            )
-        except Exception as exc:
-            logger.warning(
-                "Unable to auto-load HuggingFace processor for model (%s). "
-                "Using tokenizer instead. Reason: %s", model_name, exc)
-            self.processor = self.tokenizer
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from transformers import AutoProcessor  # noqa: F401
+        self.processor = AutoProcessor.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )

        self.postprocess_inputs = postprocess_inputs
@@ -658,8 +653,8 @@ class VllmRunner:
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

+    @staticmethod
    def _final_steps_generate_w_logprobs(
-        self,
        req_outputs: List[RequestOutput],
    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []

View File

@@ -1,80 +0,0 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness.py
```
"""
import os
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
@fork_new_process_for_each_test
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
) -> None:
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
dtype = "half"
max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

View File

@@ -1,102 +0,0 @@
"""For encoder/decoder models only:
Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness_enc_dec.py
```
"""
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.utils import cuda_device_count_stateless
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
("facebook/bart-large-cnn", "ray"),
("facebook/bart-large-cnn", "mp"),
])
@fork_new_process_for_each_test
def test_models(
model: str,
distributed_executor_backend: str,
hf_runner,
vllm_runner,
example_encoder_decoder_prompts,
) -> None:
'''
Test vLLM BART inference on more than one GPU, comparing
outputs against HF as a baseline.
Fork a new process for each test, to prevent CUDA from
being re-initialized by successive tests within the same
process.
Arguments:
* model: the HF ID of the specific BART variant under test
* distributed_executor_backend
* hf_runner: HuggingFace (HF) test model runner
* vllm_runner: vLLM test model runner
* example_encoder_decoder_prompts: test fixture which provides a
dictionary of dummy prompts
'''
dtype = "float"
max_tokens = 64
num_logprobs = 5
# Example inputs with non-trivial (i.e. not None/empty) encoder &
# decoder prompts.
test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
test_prompts, max_tokens, num_logprobs)
# Configuration settings for HF baseline
hf_kwargs = {
"top_k": None,
"num_beams": 1,
"repetition_penalty": 1.0,
"top_p": 1.0,
"length_penalty": 1.0,
"early_stopping": False,
"no_repeat_ngram_size": None,
"min_length": 0
}
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
test_prompts,
max_tokens,
num_logprobs,
**hf_kwargs,
))
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

View File

@@ -1,75 +0,0 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest test_chunked_prefill_distributed.py
```
"""
import os
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
("facebook/opt-125m", "ray"),
("meta-llama/Llama-2-7b-hf", "ray"),
("facebook/opt-125m", "mp"),
("meta-llama/Llama-2-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
) -> None:
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
assert distributed_executor_backend == "ray"
# test ray adag
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16
# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)

View File

@@ -1,58 +0,0 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest -s -v test_multimodal_broadcast.py
```
"""
import pytest
from vllm.utils import cuda_device_count_stateless
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
("llava-hf/llava-1.5-7b-hf", "ray"),
("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
("facebook/chameleon-7b", "ray"),
("llava-hf/llava-1.5-7b-hf", "mp"),
("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
("facebook/chameleon-7b", "mp"),
])
@fork_new_process_for_each_test
def test_models(hf_runner, vllm_runner, image_assets, model: str,
distributed_executor_backend: str) -> None:
dtype = "half"
max_tokens = 5
num_logprobs = 5
tensor_parallel_size = 2
if model.startswith("llava-hf/llava-1.5"):
from ..models.test_llava import models, run_test
elif model.startswith("llava-hf/llava-v1.6"):
from ..models.test_llava_next import run_test # type: ignore[no-redef]
from ..models.test_llava_next import models
elif model.startswith("facebook/chameleon"):
from ..models.test_chameleon import run_test # type: ignore[no-redef]
from ..models.test_chameleon import models
else:
raise NotImplementedError(f"Unsupported model: {model}")
run_test(
hf_runner,
vllm_runner,
image_assets,
model=models[0],
# So that LLaVA-NeXT processor may return nested list
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)

View File

@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
-        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
+        # NOTE: InternVL2 multi-node tests are flaky,
+        # use mp backend to skip the multi-node tests
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
+        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
+        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
    ],
)
@fork_new_process_for_each_test

View File

@@ -1,13 +1,13 @@
import os

-import torch
+import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

-torch.distributed.init_process_group(backend="gloo")
-test_result = all(
-    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
-expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-assert test_result == expected, f"Expected {expected}, got {test_result}"
-print("Same node test passed!")
+if __name__ == "__main__":
+    dist.init_process_group(backend="gloo")
+    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
+    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
+    assert test_result == expected, f"Expected {expected}, got {test_result}"
+    print("Same node test passed!")

View File

@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
    # token ids.
    llm = LLM(model=model, skip_tokenizer_init=True)
    sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
-    with pytest.raises(ValueError) as err:
+    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        llm.generate("abc", sampling_params)
-    assert "prompts must be None if" in str(err.value)
+
    outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
                           sampling_params=sampling_params)
    assert len(outputs) > 0
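`pytest.raises(..., match=...)` folds the message check into the context manager; the pattern is a regular expression applied with `re.search`. A minimal sketch of the two equivalent styles, using a made-up error message for illustration:

```python
import pytest

def boom():
    raise ValueError("cannot pass text prompts when tokenizer is skipped")

# Old style: capture the exception, then assert on its message.
with pytest.raises(ValueError) as err:
    boom()
assert "cannot pass text prompts" in str(err.value)

# New style: the match regex replaces the manual assertion.
with pytest.raises(ValueError, match="cannot pass text prompts when"):
    boom()
```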

View File

@@ -0,0 +1,77 @@
"""Tests for HF_HUB_OFFLINE mode"""
import importlib
import sys
import weakref
import pytest
from vllm import LLM
from ...conftest import cleanup
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup()
@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
# we use the llm fixture to ensure the model files are in-cache
del llm
# Set HF to offline mode and ensure we can still construct an LLM
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
_re_import_modules()
pass
def _re_import_modules():
hf_hub_module_names = [
k for k in sys.modules if k.startswith("huggingface_hub")
]
transformers_module_names = [
k for k in sys.modules if k.startswith("transformers")
and not k.startswith("transformers_modules")
]
reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
# Try to continue clean up so that other tests are less likely to
# be affected
# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception

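Outside of tests, offline mode is normally enabled before the first `huggingface_hub` import, because the flag is read into module-level constants at import time; that is why `_re_import_modules` above has to force a reload. A minimal sketch of the usual pattern:

```python
import os

# Set before importing anything that pulls in huggingface_hub, so the flag
# is seen at import time and no reload is needed.
os.environ["HF_HUB_OFFLINE"] = "1"

from vllm import LLM  # noqa: E402

# Uses only locally cached model files; fails fast if they are missing.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)
```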
View File

@@ -10,7 +10,6 @@ import pytest
import torch

from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersBackend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
                        make_tensor_with_pad)
@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
    * Backend instance
    '''
    if backend_name == STR_XFORMERS_ATTN_VAL:
+        # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
+        from vllm.attention.backends.xformers import XFormersBackend
        return XFormersBackend()
    raise AssertionError(
        f"Unrecognized backend_name {backend_name} for unit test")

View File

@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

-from ..conftest import HfRunner, VllmRunner
-from .utils import check_logprobs_close
+from ....conftest import HfRunner, VllmRunner
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

MODEL_NAME = "fixie-ai/ultravox-v0_3"

View File

@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
import pytest
import torch

-from .utils import check_outputs_equal
+from ...utils import check_outputs_equal

MODELS = [
    "meta-llama/Llama-2-7b-hf",

View File

@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
""" """
import pytest import pytest
from .utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = ["h2oai/h2o-danube3-4b-base"] MODELS = ["h2oai/h2o-danube3-4b-base"]

View File

@@ -10,7 +10,7 @@ import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported

-from ..models.utils import check_logprobs_close
+from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

View File

@@ -11,7 +11,7 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported

-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

View File

@@ -15,7 +15,7 @@ import pytest
from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT

-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

View File

@@ -10,9 +10,10 @@ from dataclasses import dataclass
import pytest

-from tests.models.utils import check_logprobs_close
from tests.quantization.utils import is_quant_method_supported
+from ...utils import check_logprobs_close

@dataclass
class ModelPair:

View File

@@ -6,7 +6,7 @@ import importlib.metadata
import pytest

-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close

TRANSFORMERS_VERSION = tuple(
    map(int,

View File

@@ -1,8 +1,9 @@
import pytest

-from tests.models.utils import check_outputs_equal
from vllm.worker.model_runner import _get_graph_batch_size
+from ...utils import check_outputs_equal

MODELS = ["ai21labs/Jamba-tiny-random"]

View File

@@ -16,7 +16,7 @@ import pytest
from tests.quantization.utils import is_quant_method_supported

-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close

@dataclass

View File

@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
""" """
import pytest import pytest
from .utils import check_logprobs_close from ...utils import check_logprobs_close
MODELS = [ MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1",

View File

@@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
""" """
import pytest import pytest
from .utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",

View File

@@ -7,7 +7,7 @@ import torch
from vllm.utils import is_cpu

-from .utils import check_logprobs_close
+from ...utils import check_logprobs_close

MODELS = [
    "microsoft/Phi-3.5-MoE-instruct",

View File

@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

-from ..conftest import IMAGE_ASSETS
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalData objects and corresponding
    MultiModalConfig as input.

View File

@@ -0,0 +1,42 @@
import pytest
from ....utils import multi_gpu_test
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"facebook/chameleon-7b",
])
def test_models(hf_runner, vllm_runner, image_assets,
distributed_executor_backend, model) -> None:
dtype = "half"
max_tokens = 5
num_logprobs = 5
tensor_parallel_size = 2
if model.startswith("llava-hf/llava-1.5"):
from .test_llava import models, run_test
elif model.startswith("llava-hf/llava-v1.6"):
from .test_llava_next import models, run_test # type: ignore[no-redef]
elif model.startswith("facebook/chameleon"):
from .test_chameleon import models, run_test # type: ignore[no-redef]
else:
raise NotImplementedError(f"Unsupported model: {model}")
run_test(
hf_runner,
vllm_runner,
image_assets,
model=models[0],
# So that LLaVA-NeXT processor may return nested list
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)

View File

@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, BatchEncoding
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_outputs_equal

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -36,7 +34,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding vision language config as input.

View File

@@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -46,7 +44,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

View File

@@ -6,9 +6,7 @@ import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor

-from ..conftest import _ImageAssets, cleanup
+from ....conftest import _ImageAssets, cleanup

-pytestmark = pytest.mark.vlm

# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner

View File

@@ -9,11 +9,9 @@ from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu

-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -78,7 +76,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
@@ -331,6 +329,41 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
    )
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
inputs_batching = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_multi_images = [
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors])
]
for inputs in [inputs_batching, inputs_multi_images]:
run_test(
hf_runner,
vllm_runner,
inputs,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
@pytest.mark.parametrize(
    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
@pytest.mark.parametrize(

View File

@@ -8,11 +8,9 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

_LIMIT_IMAGE_PER_PROMPT = 4
@@ -143,7 +141,7 @@ def _run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
@@ -239,7 +237,7 @@ def _run_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+                dtype, max_tokens, num_logprobs) -> None:
    run_test(
        hf_runner,
        vllm_runner,

View File

@@ -5,10 +5,8 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.sequence import SampleLogprobs

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -62,7 +60,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding vision language config as input.

View File

@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

-from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
-                        _ImageAssets)
-from .utils import check_logprobs_close
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                          _ImageAssets)
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

_LIMIT_IMAGE_PER_PROMPT = 4
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype, max_tokens, num_logprobs) -> None:
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

View File

@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
                                       sample_frames_from_video)
from vllm.sequence import SampleLogprobs

-from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
-from .utils import check_logprobs_close
+from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

_PREFACE = (
    "A chat between a curious human and an artificial intelligence assistant. "

View File

@@ -9,10 +9,8 @@ from transformers import BatchEncoding
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -65,7 +63,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

View File

@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_hip

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -69,7 +67,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

View File

@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip

-from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
-from .utils import check_logprobs_close
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ...utils import check_logprobs_close

-pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -71,7 +69,7 @@ def run_test(
):
    """Inference result should be the same between hf and vllm.

-    All the image fixtures for the test is under tests/images.
+    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.

View File

@@ -0,0 +1,199 @@
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import json
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import pytest
from mistral_common.protocol.instruct.messages import ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins
from vllm.sequence import Logprob, SampleLogprobs
from ....utils import VLLM_PATH
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"]
IMG_URLS = [
"https://picsum.photos/id/237/400/300",
"https://picsum.photos/id/231/200/300",
"https://picsum.photos/id/27/500/500",
"https://picsum.photos/id/17/150/600",
]
PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"text": PROMPT,
}] + [{
"type": "image_url",
"image_url": {
"url": url
}
} for url in urls],
}]
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
msg = _create_msg_format(urls)
tokenizer = MistralTokenizer.from_model("pixtral")
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
tokenized = tokenizer.encode_chat_completion(request)
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
images = []
for chunk in request.messages[0].content:
if isinstance(chunk, ImageURLChunk):
images.append(image_from_chunk(chunk))
mm_data = MultiModalDataBuiltins(image=images)
engine_inputs["multi_modal_data"] = mm_data
return engine_inputs
MSGS = [
_create_msg_format(IMG_URLS[:1]),
_create_msg_format(IMG_URLS[:2]),
_create_msg_format(IMG_URLS),
]
ENGINE_INPUTS = [
_create_engine_inputs(IMG_URLS[:1]),
_create_engine_inputs(IMG_URLS[:2]),
_create_engine_inputs(IMG_URLS),
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
# For the test author to store golden output in JSON
def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text,
[{k: asdict(v)
for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
with open(filename, "w") as f:
json.dump(json_data, f)
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text,
[{int(k): Logprob(**v)
for k, v in token_logprobs.items()}
for token_logprobs in logprobs])
for tokens, text, logprobs in json_data]
@pytest.mark.skip(
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
vllm_runner,
max_model_len: int,
model: str,
dtype: str,
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
for msg in MSGS:
output = vllm_model.model.chat(msg,
sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@pytest.mark.skip(
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
args = EngineArgs(
model=model,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
dtype=dtype,
)
engine = LLMEngine.from_engine_args(args)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
outputs = []
count = 0
while True:
out = engine.step()
count += 1
for request_output in out:
if request_output.finished:
outputs.append(request_output)
if count == 2:
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
SAMPLING_PARAMS)
if not engine.has_unfinished_requests():
break
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")

View File

@@ -0,0 +1,401 @@
import pathlib
from typing import Dict, List, Optional, Tuple, Type, Union
import pytest
import torch
from PIL.Image import Image
from vllm.config import ModelConfig
from vllm.inputs import InputContext, LLMInputs
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
VllmRunner, _ImageAssets)
from ...utils import check_logprobs_close
text_only_models = [
"Qwen/Qwen-7B-Chat" # Has no visual component
]
multimodal_models = ["Qwen/Qwen-VL"]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"Picture 1: <img></img>\nWhat's the content of the image?: ",
"cherry_blossom":
"Picture 1: <img></img>\nWhat is the season?: ",
})
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n"  # noqa: E501
### Multimodal preprocessing tests
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
# These values are specific to Qwen-VL/Chat; we can get these from the model
# config also, but they are hardcoded here to keep the parameterize/fixtures
# easy to read.
IMG_START_ID = 151857
IMG_END_ID = 151858
IMG_PAD_ID = 151859
TOKS_PER_IMG = 256
VIS_ENC_DIM = 4096
IMG_SIZE = 448
def build_model_context(model_name: str,
tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False):
"""Creates an InputContext for a given model.
Args:
model_name: Name of the model being considered.
tokenizer_name: Name of the tokenizer being considered.
trust_remote_code: Whether or not to allow loading remote code.
Returns:
InputContext for the model being considered.
"""
if tokenizer_name is None:
tokenizer_name = model_name
model_config = ModelConfig(
model_name,
tokenizer_name,
tokenizer_mode="auto",
trust_remote_code=trust_remote_code,
dtype="float32",
seed=0,
)
return InputContext(model_config)
@pytest.fixture()
def input_mapper_for_qwen():
# Lazy import to avoid initializing CUDA during test collection
from vllm.model_executor.models.qwen import input_mapper_for_qwen
return input_mapper_for_qwen
@pytest.fixture()
def input_processor_for_qwen():
# Lazy import to avoid initializing CUDA during test collection
from vllm.model_executor.models.qwen import input_processor_for_qwen
return input_processor_for_qwen
@pytest.fixture()
def qwen_vl_context() -> InputContext:
"""Get an InputContext for Qwen-VL."""
return build_model_context(model_name="Qwen/Qwen-VL",
trust_remote_code=True)
# Happy path tests for single/multi-image scenarios for the multimodal
# input processor and mapper, respectively
@pytest.mark.parametrize("num_images", [1, 2])
def test_input_processor_valid_mm_data(input_processor_for_qwen,
qwen_vl_context: InputContext,
num_images: int):
"""Happy cases for image inputs to Qwen's multimodal input processor."""
prompt = "".join(
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
inputs = LLMInputs(
prompt=prompt,
# When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with
# the image prompts
prompt_token_ids=None,
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
)
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
assert isinstance(proc_inputs, dict)
# Each image should have one start / stop and a fixed context of 256
proc_tokens = proc_inputs["prompt_token_ids"]
assert proc_tokens.count(IMG_START_ID) == num_images
assert proc_tokens.count(IMG_END_ID) == num_images
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
@pytest.mark.parametrize(
"img_data,expected_shape",
[
# single / multi-image
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
# single / multi-image embeddings
(torch.rand(
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
(torch.rand(
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
(torch.rand(
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
])
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
qwen_vl_context: InputContext,
img_data: Union[torch.Tensor, List[Image],
Image],
expected_shape: List[int]):
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert isinstance(mapped_img_data, MultiModalInputs)
assert "pixel_values" in mapped_img_data
assert mapped_img_data["pixel_values"].shape == expected_shape
# Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
{
"image": torch.rand((5))
},
{
"image": torch.rand((5, 5, 5, 5, 5))
},
])
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
qwen_vl_context: InputContext,
mm_data: Dict[str, torch.Tensor]):
"""Test sad cases validated in Qwen's multimodal input processor."""
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
trust_remote_code=True)
prompt = "Picture 1: <img></img>\n"
prompt_token_ids = tokenizer.encode(prompt)
inputs = LLMInputs(prompt=prompt,
prompt_token_ids=prompt_token_ids,
multi_modal_data=mm_data)
# Should fail since we have too many or too few dimensions for embeddings
with pytest.raises(ValueError):
input_processor_for_qwen(qwen_vl_context, inputs)
@pytest.mark.parametrize(
"img_data",
[
# Wrong context length
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
# Wrong visual encoder output size
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
])
def test_input_mapper_invalid_mm_data(
input_mapper_for_qwen,
qwen_vl_context: InputContext,
img_data: Union[torch.Tensor, List[Image], Image],
):
"""Sad cases validated in Qwen VL's multimodal input mapper."""
with pytest.raises(ValueError):
input_mapper_for_qwen(qwen_vl_context, img_data)
### End-to-end generation tests
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image ni its
forward() call.
Args:
tmp_path: Tempdir for test under consideration.
prompt: Prompt with image placeholders.
assets: List of image assets whose len equals the num placeholders.
"""
# Ensure that the number of placeholders matches the number of assets;
# If this is not true, the test is probably written incorrectly.
assert prompt.count("<img></img>") == len(assets)
# Replace the placeholders with local paths to the exported assets
for asset in assets:
image_tmp_path = tmp_path / f"{asset.name}.jpg"
asset.pil_image.save(image_tmp_path)
prompt = prompt.replace(
"<img></img>",
f"<img>{image_tmp_path}</img>",
1,
)
return prompt
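As a concrete illustration of what this helper returns (the tempdir path below is hypothetical; the real one comes from pytest's tmp_path fixture):

# prompt = "Picture 1: <img></img>\n", assets = [stop_sign asset]
# get_prompt_with_path(tmp_path, prompt, assets) would yield e.g.
# "Picture 1: <img>/tmp/pytest-0/test_0/stop_sign.jpg</img>\n"
# with each empty <img></img> placeholder filled, left to right, by the
# local path of the corresponding exported image.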
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
# Qwen encodes each image into a fixed content size of 256
with vllm_runner(model,
max_model_len=1024,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs
]
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets, model: str,
size_factors: List[float], dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
"""Tests multimodal models with single image prompts."""
images = [asset.pil_image for asset in image_assets]
prompts = [
get_prompt_with_path(tmp_path, prompt, [asset])
for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
inputs = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, prompts)]
run_test(
hf_runner,
vllm_runner,
inputs,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets, model: str,
size_factors: List[float], dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
"""Tests multimodal models with multi-image prompts."""
images = [asset.pil_image for asset in image_assets]
# Put all of the images into one prompt.
prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
image_assets)
inputs = [([prompt for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors])]
run_test(
hf_runner,
vllm_runner,
inputs,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)
# Ensure that a text-only Qwen model can still be loaded and
# used for inference in vLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
vllm_runner: Type[VllmRunner],
example_prompts: List[str],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
):
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_model.generate_greedy_logprobs(
example_prompts,
max_tokens,
num_logprobs=num_logprobs,
)

View File

@@ -1,8 +1,8 @@
 """Compare the outputs of HF and vLLM for BART models using greedy sampling.

-Run `pytest tests/models/test_bart.py`.
+Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
 """
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Type

 from vllm.utils import is_cpu
@@ -16,8 +16,10 @@ if not is_cpu():
     from vllm.sequence import SampleLogprobs

-    from ..conftest import DecoderPromptType
-    from .utils import check_logprobs_close
+    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
+                              HfRunner, VllmRunner)
+    from ....utils import multi_gpu_test
+    from ...utils import check_logprobs_close

     MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
@@ -34,20 +36,18 @@ if not is_cpu():
         return output_ids, hf_output_str, out_logprobs

-    @pytest.mark.parametrize("model", MODELS)
-    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
-    @pytest.mark.parametrize("max_tokens", [64])
-    @pytest.mark.parametrize("num_logprobs", [5])
-    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
-    def test_models(
-        hf_runner,
-        vllm_runner,
-        example_encoder_decoder_prompts,
+    def run_test(
+        hf_runner: Type[HfRunner],
+        vllm_runner: Type[VllmRunner],
+        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+        decoder_prompt_type: DecoderPromptType,
         model: str,
+        *,
         dtype: str,
         max_tokens: int,
         num_logprobs: int,
-        decoder_prompt_type: DecoderPromptType,
+        tensor_parallel_size: int,
+        distributed_executor_backend: Optional[str] = None,
     ) -> None:
         '''
         Test the vLLM BART model for a variety of encoder/decoder input prompts,
@@ -116,8 +116,29 @@ if not is_cpu():
         token during the process of validating the vLLM decoded output.
         '''

-        test_case_prompts = example_encoder_decoder_prompts[
-            decoder_prompt_type]
+        # NOTE: take care of the order. run vLLM first, and then run HF.
+        # vLLM needs a fresh new process without cuda initialization.
+        # if we run HF first, the cuda initialization will be done and it
+        # will hurt multiprocessing backend with fork method (the default).
+
+        # Note: currently encoder/decoder models are only compatible with
+        # enforce_eager=True. Normally this is not a problem because
+        # for encoder/decoder models vLLM will
+        # default to enforce_eager=True if enforce_eager
+        # is left unspecified. However, the
+        # VllmRunner test fixture (which wraps around the LLM class) defaults to
+        # enforce_eager=False (a behavior which a number of already-existing
+        # decoder-only unit tests expect), so when testing an encoder/decoder
+        # model we must explicitly specify enforce_eager=True in the VllmRunner
+        # constructor.
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                tensor_parallel_size=tensor_parallel_size,
+                distributed_executor_backend=distributed_executor_backend,
+                enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+                prompts, max_tokens, num_logprobs)

         # Configuration settings for HF baseline
         hf_kwargs = {
@@ -135,26 +156,12 @@ if not is_cpu():
                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
             hf_outputs = (
                 hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-                    test_case_prompts,
+                    prompts,
                     max_tokens,
                     num_logprobs,
                     **hf_kwargs,
                 ))

-        # Note: currently encoder/decoder models are only compatible with
-        # enforce_eager=True. Normally this is not a problem because
-        # for encoder/decoder models vLLM will
-        # default to enforce_eager=True if enforce_eager
-        # is left unspecified. However, the
-        # VllmRunner test fixture (which wraps around the LLM class) defaults to
-        # enforce_eager=False (a behavior which a number of already-existing
-        # decoder-only unit tests expect), so when testing an encoder/decoder
-        # model we must explicitly specify enforce_eager=True in the VllmRunner
-        # constructor.
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-                test_case_prompts, max_tokens, num_logprobs)

         hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                           else 0)
@@ -168,3 +175,49 @@ if not is_cpu():
             name_1="vllm",
             num_outputs_0_skip_tokens=hf_skip_tokens,
         )

+    @pytest.mark.parametrize("model", MODELS)
+    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
+                    model, dtype, max_tokens, num_logprobs,
+                    decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=1,
+        )
+
+    @multi_gpu_test(num_gpus=2)
+    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+    @pytest.mark.parametrize("dtype", ["float"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+    def test_models_distributed(hf_runner, vllm_runner,
+                                example_encoder_decoder_prompts,
+                                distributed_executor_backend, model, dtype,
+                                max_tokens, num_logprobs,
+                                decoder_prompt_type) -> None:
+        run_test(
+            hf_runner,
+            vllm_runner,
+            example_encoder_decoder_prompts[decoder_prompt_type],
+            decoder_prompt_type,
+            model,
+            dtype=dtype,
+            max_tokens=max_tokens,
+            num_logprobs=num_logprobs,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+        )

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,64 +0,0 @@
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import pytest
from vllm.sampling_params import SamplingParams
pytestmark = pytest.mark.vlm
MODELS = ["mistralai/Pixtral-12B-2409"]
@pytest.mark.skip(
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
image_urls = [
"https://picsum.photos/id/237/200/300",
"https://picsum.photos/seed/picsum/200/300"
]
expected = [
"The image depicts a black dog lying on a wooden surface, looking directly at the camera with a calm expression.", # noqa
"The image depicts a serene landscape with a snow-covered mountain under a pastel-colored sky during sunset." # noqa
]
prompt = "Describe the image in one short sentence."
sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
with vllm_runner(model, dtype=dtype,
tokenizer_mode="mistral") as vllm_model:
for i, image_url in enumerate(image_urls):
messages = [
{
"role":
"user",
"content": [{
"type": "text",
"text": prompt
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}]
},
]
outputs = vllm_model.model.chat(messages,
sampling_params=sampling_params)
assert outputs[0].outputs[0].text == expected[i]

View File

@@ -1,165 +0,0 @@
import pathlib
from typing import List, Optional, Type
import pytest
from vllm.multimodal.utils import rescale_image_size
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
text_only_models = [
"Qwen/Qwen-7B-Chat" # Has no visual component
]
multimodal_models = ["Qwen/Qwen-VL"]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"Picture 1: <img></img>\nWhat's the content of the image?: ",
"cherry_blossom":
"Picture 1: <img></img>\nWhat is the season?: ",
})
### Tests for multimodal Qwen models
def run_test(
tmp_path: pathlib.PosixPath,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images = [asset.pil_image for asset in image_assets]
# Export the images to a tempdir and substitute it into the hf prompt;
# the contents between <img>/</img> will be ignored by VLLM, but the
# transformers implementation for the visual transformer parses this to
# reload it in the forward call; the contents are treated as a URL or a
# local path.
for idx, asset in enumerate(image_assets):
image_tmp_path = tmp_path / f"{asset.name}.jpg"
asset.pil_image.save(image_tmp_path)
HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
"<img></img>", f"<img>{image_tmp_path}</img>")
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
# Qwen encodes images into a fixed content size of 256
with vllm_runner(model,
max_model_len=300,
max_num_seqs=1,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
model, size_factors, dtype, max_tokens,
num_logprobs) -> None:
run_test(
tmp_path,
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
# Ensure that a text-only Qwen model can still be loaded and
# used for inference in VLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
vllm_runner: Type[VllmRunner],
example_prompts,
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
):
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_model.generate_greedy_logprobs(
example_prompts,
max_tokens,
num_logprobs=num_logprobs,
)

View File

@@ -1,9 +1,10 @@
 # Test the AsyncLLMEngine with multi-step-decoding

 from typing import List, Optional

 import pytest

+from tests.kernels.utils import override_backend_env_variable
 from ..models.utils import check_logprobs_close
 from ..utils import (completions_with_server_args, get_client_text_generations,
                      get_client_text_logprob_generations)
@@ -33,8 +34,9 @@ DEFAULT_SERVER_ARGS: List[str] = [
 @pytest.mark.parametrize("eager_mode", [False, True])
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
-@pytest.mark.parametrize("num_logprobs", [None, 5])
-@pytest.mark.parametrize("is_async", [False, True])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("is_async", [True])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
 @pytest.mark.asyncio
 async def test_multi_step(
     example_prompts,
@@ -46,6 +48,8 @@ async def test_multi_step(
     num_prompts: int,
     is_async: bool,
     num_logprobs: Optional[int],
+    attention_backend: str,
+    monkeypatch,
 ) -> None:
     """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
     client/server environment.
@@ -71,6 +75,8 @@ async def test_multi_step(
       completions endpoint; `None` -> no logprobs
     """

+    override_backend_env_variable(monkeypatch, attention_backend)
+
     prompts = example_prompts
     if len(prompts) < num_prompts:
         prompts = prompts * ((num_prompts // len(prompts)) + 1)
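The override_backend_env_variable helper imported above lives in tests/kernels/utils.py; a minimal sketch of what it is expected to do, assuming vLLM selects its attention backend via the VLLM_ATTENTION_BACKEND environment variable:

def override_backend_env_variable(monkeypatch, backend_name: str) -> None:
    # Scoped to a single test via pytest's monkeypatch fixture.
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend_name)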

View File

@@ -10,6 +10,8 @@ import torch

 from tests.quantization.utils import is_quant_method_supported

+from ..utils import fork_new_process_for_each_test
+
 models_4bit_to_test = [
     ('huggyllama/llama-7b', 'quantize model inflight'),
 ]
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_qaunt_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_quant_8bit_to_test)
+@fork_new_process_for_each_test
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
                              model_name,
                              hf_model_kwargs=None):

-    if hf_model_kwargs is None:
-        hf_model_kwargs = {}
-
-    # Run with HF runner
-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
-        hf_outputs = llm.generate_greedy(prompts, 8)
-        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
-
-    # Clean up the GPU memory for the next test
-    torch.cuda.synchronize()
-    gc.collect()
-    torch.cuda.empty_cache()
+    # NOTE: run vLLM first, as it requires a clean process
+    # when using distributed inference

     #Run with vLLM runner
     with vllm_runner(model_name,
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
     gc.collect()
     torch.cuda.empty_cache()

+    if hf_model_kwargs is None:
+        hf_model_kwargs = {}
+
+    # Run with HF runner
+    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+        hf_outputs = llm.generate_greedy(prompts, 8)
+        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
+
+    # Clean up the GPU memory for the next test
+    torch.cuda.synchronize()
+    gc.collect()
+    torch.cuda.empty_cache()
+
     # Compare the generated strings
     for hf_log, vllm_log in zip(hf_logs, vllm_logs):
         hf_str = hf_log["generated_text"]

View File

@@ -1,12 +1,10 @@
-import torch
-
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.platforms import current_platform


 def is_quant_method_supported(quant_method: str) -> bool:
     # Currently, all quantization methods require Nvidia or AMD GPUs
-    if not torch.cuda.is_available():
+    if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False

     capability = current_platform.get_device_capability()

View File

@@ -10,6 +10,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional

 import openai
+import pytest
 import requests
 from openai.types.completion import Completion
 from transformers import AutoTokenizer
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
+from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
+                        get_open_port, is_hip)

 if current_platform.is_rocm():
     from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -356,12 +358,23 @@ def error_on_warning():
     yield


+def get_physical_device_indices(devices):
+    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if visible_devices is None:
+        return devices
+
+    visible_indices = [int(x) for x in visible_devices.split(",")]
+    index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
+    return [index_mapping[i] for i in devices if i in index_mapping]
+
+
 @_nvml()
 def wait_for_gpu_memory_to_clear(devices: List[int],
                                  threshold_bytes: int,
                                  timeout_s: float = 120) -> None:
     # Use nvml instead of pytorch to reduce measurement error from torch cuda
     # context.
+    devices = get_physical_device_indices(devices)
     start_time = time.time()
     while True:
         output: Dict[int, str] = {}
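A worked example of the mapping performed by get_physical_device_indices (values are hypothetical): with CUDA_VISIBLE_DEVICES="2,3", the logical devices [0, 1] that a test sees correspond to physical GPUs 2 and 3, which is what nvml expects.

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
assert get_physical_device_indices([0, 1]) == [2, 3]
# With CUDA_VISIBLE_DEVICES unset, the input list is returned unchanged.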
@@ -441,6 +454,22 @@ def fork_new_process_for_each_test(
     return wrapper


+def multi_gpu_test(*, num_gpus: int):
+    """
+    Decorate a test to be run only when multiple GPUs are available.
+    """
+    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
+    test_skipif = pytest.mark.skipif(
+        cuda_device_count_stateless() < num_gpus,
+        reason=f"Need at least {num_gpus} GPUs to run the test.",
+    )
+
+    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
+        return test_selector(test_skipif(fork_new_process_for_each_test(f)))
+
+    return wrapper
+
+
 async def completions_with_server_args(
     prompts: List[str],
     model_name: str,
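Typical usage of the new decorator then looks like this (hypothetical test; the pattern mirrors test_models_distributed in the BART diff above):

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_something_distributed(distributed_executor_backend: str) -> None:
    # Skipped automatically when fewer than 2 GPUs are visible, and run
    # in a freshly forked process so CUDA state cannot leak.
    ...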

View File

@@ -161,16 +161,36 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)


-def advance_step(num_seqs: int, num_queries: int, block_size: int,
-                 input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
-                 input_positions: torch.Tensor, seq_lens: torch.Tensor,
-                 slot_mapping: torch.Tensor,
-                 block_tables: torch.Tensor) -> None:
+def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
+                           input_tokens: torch.Tensor,
+                           sampled_token_ids: torch.Tensor,
+                           input_positions: torch.Tensor,
+                           seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                           block_tables: torch.Tensor) -> None:
     """Advance a step on GPU for existing inputs for a multi-step runner"""
-    return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
-                                     input_tokens, sampled_token_ids,
-                                     input_positions, seq_lens, slot_mapping,
-                                     block_tables)
+    return torch.ops._C.advance_step_flashattn(num_seqs, num_queries,
+                                               block_size, input_tokens,
+                                               sampled_token_ids,
+                                               input_positions, seq_lens,
+                                               slot_mapping, block_tables)
+
+
+def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
+                            input_tokens: torch.Tensor,
+                            sampled_token_ids: torch.Tensor,
+                            input_positions: torch.Tensor,
+                            seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
+                            block_tables: torch.Tensor,
+                            paged_kv_indices: torch.Tensor,
+                            paged_kv_indptr: torch.Tensor,
+                            paged_kv_last_page_len: torch.Tensor,
+                            block_table_bound: torch.Tensor) -> None:
+
+    return torch.ops._C.advance_step_flashinfer(
+        num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
+        input_positions, seq_lens, slot_mapping, block_tables,
+        paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len,
+        block_table_bound)


 # quantization ops

View File

@@ -83,7 +83,9 @@ class AttentionBackend(ABC):
     ) -> None:
         raise NotImplementedError

-    def advance_step(self, num_seqs: int, num_queries: int):
+    def advance_step(self, model_input: "ModelRunnerInputBase",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int, num_seqs: int, num_queries: int) -> None:
         raise NotImplementedError

View File

@@ -122,6 +122,40 @@ def _(
     return torch.empty_like(decode_query)


+@torch.library.custom_op("vllm::reshape_and_cache_flash",
+                         mutates_args=["kv_cache"])
+def reshape_and_cache_flash(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: float,
+    v_scale: float,
+) -> None:
+    """Inductor cannot deal with inplace operations on views.
+    See https://github.com/pytorch/pytorch/issues/131192
+    and https://github.com/pytorch/pytorch/issues/130174
+    This is a workaround to hide the view operation from the inductor.
+    """
+    return torch.ops._C_cache_ops.reshape_and_cache_flash(
+        key, value, kv_cache[0], kv_cache[1], slot_mapping, kv_cache_dtype,
+        k_scale, v_scale)
+
+
+@reshape_and_cache_flash.register_fake  # type: ignore
+def _(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    kv_cache_dtype: str,
+    k_scale: float,
+    v_scale: float,
+) -> None:
+    pass
+
+
 class FlashAttentionBackend(AttentionBackend):

     @staticmethod
@@ -346,15 +380,15 @@ class FlashAttentionMetadata(AttentionMetadata):
             self.seq_lens[i] += 1
         self.max_decode_seq_len = max(self.seq_lens)

-        ops.advance_step(num_seqs=num_seqs,
-                         num_queries=num_queries,
-                         block_size=block_size,
-                         input_tokens=model_input.input_tokens,
-                         sampled_token_ids=sampled_token_ids,
-                         input_positions=model_input.input_positions,
-                         seq_lens=self.seq_lens_tensor,
-                         slot_mapping=self.slot_mapping,
-                         block_tables=self.block_tables)
+        ops.advance_step_flashattn(num_seqs=num_seqs,
+                                   num_queries=num_queries,
+                                   block_size=block_size,
+                                   input_tokens=model_input.input_tokens,
+                                   sampled_token_ids=sampled_token_ids,
+                                   input_positions=model_input.input_positions,
+                                   seq_lens=self.seq_lens_tensor,
+                                   slot_mapping=self.slot_mapping,
+                                   block_tables=self.block_tables)


 class FlashAttentionMetadataBuilder(
@@ -653,11 +687,10 @@ class FlashAttentionImpl(AttentionImpl):
             # Reshape the input keys and values and store them in the cache.
             # If kv_cache is not provided, the new key and value tensors are
             # not cached. This happens during the initial memory profiling run.
-            ops.reshape_and_cache_flash(
+            torch.ops.vllm.reshape_and_cache_flash(
                 key,
                 value,
-                key_cache,
-                value_cache,
+                kv_cache,
                 attn_metadata.slot_mapping.flatten(),
                 self.kv_cache_dtype,
                 k_scale,
@@ -669,7 +702,6 @@ class FlashAttentionImpl(AttentionImpl):
         assert key.shape[0] == num_prefill_tokens + num_decode_tokens
         assert value.shape[0] == num_prefill_tokens + num_decode_tokens

-        output = torch.empty_like(query)
         # Query for decode. KV is not needed because it is already cached.
         decode_query = query[num_prefill_tokens:]
         # QKV for prefill.
@@ -680,6 +712,9 @@ class FlashAttentionImpl(AttentionImpl):
         assert query.shape[0] == num_prefill_tokens
         assert decode_query.shape[0] == num_decode_tokens

+        prefill_output: Optional[torch.Tensor] = None
+        decode_output: Optional[torch.Tensor] = None
+
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
             if (kv_cache is None or prefill_meta.block_tables is None
@@ -687,7 +722,7 @@ class FlashAttentionImpl(AttentionImpl):
                 # normal attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                out = torch.ops.vllm.flash_attn_varlen_func(
+                prefill_output = torch.ops.vllm.flash_attn_varlen_func(
                     q=query,
                     k=key,
                     v=value,
@@ -701,42 +736,44 @@ class FlashAttentionImpl(AttentionImpl):
                     alibi_slopes=self.alibi_slopes,
                     softcap=self.logits_soft_cap,
                 )
-                assert output[:num_prefill_tokens].shape == out.shape
-                output[:num_prefill_tokens] = out
             else:
                 # prefix-enabled attention
                 assert prefill_meta.seq_lens is not None
                 max_seq_len = max(prefill_meta.seq_lens)
-                output[:
-                       num_prefill_tokens] = torch.ops.vllm.flash_attn_varlen_func(  # noqa
-                           q=query,
-                           k=key_cache,
-                           v=value_cache,
-                           cu_seqlens_q=prefill_meta.query_start_loc,
-                           max_seqlen_q=prefill_meta.max_query_len,
-                           cu_seqlens_k=prefill_meta.seq_start_loc,
-                           max_seqlen_k=max_seq_len,
-                           softmax_scale=self.scale,
-                           causal=True,
-                           alibi_slopes=self.alibi_slopes,
-                           block_table=prefill_meta.block_tables,
-                           softcap=self.logits_soft_cap,
-                       )
-        if decode_meta := attn_metadata.decode_metadata:
-            # Decoding run.
-            output[
-                num_prefill_tokens:] = torch.ops.vllm.flash_attn_with_kvcache(
-                    decode_query.unsqueeze(1),
-                    key_cache,
-                    value_cache,
-                    block_table=decode_meta.block_tables,
-                    cache_seqlens=decode_meta.seq_lens_tensor,
-                    softmax_scale=self.scale,
-                    causal=True,
-                    alibi_slopes=self.alibi_slopes,
-                    softcap=self.logits_soft_cap,
-                ).squeeze(1)
-
-        # Reshape the output tensor.
+                prefill_output = torch.ops.vllm.flash_attn_varlen_func(  # noqa
+                    q=query,
+                    k=key_cache,
+                    v=value_cache,
+                    cu_seqlens_q=prefill_meta.query_start_loc,
+                    max_seqlen_q=prefill_meta.max_query_len,
+                    cu_seqlens_k=prefill_meta.seq_start_loc,
+                    max_seqlen_k=max_seq_len,
+                    softmax_scale=self.scale,
+                    causal=True,
+                    alibi_slopes=self.alibi_slopes,
+                    block_table=prefill_meta.block_tables,
+                    softcap=self.logits_soft_cap,
+                )
+
+        if decode_meta := attn_metadata.decode_metadata:
+            # Decoding run.
+            decode_output = torch.ops.vllm.flash_attn_with_kvcache(
+                decode_query.unsqueeze(1),
+                key_cache,
+                value_cache,
+                block_table=decode_meta.block_tables,
+                cache_seqlens=decode_meta.seq_lens_tensor,
+                softmax_scale=self.scale,
+                causal=True,
+                alibi_slopes=self.alibi_slopes,
+                softcap=self.logits_soft_cap,
+            ).squeeze(1)
+
+        if prefill_output is None:
+            assert decode_output is not None
+            return decode_output.view(num_decode_tokens, hidden_size)
+
+        if decode_output is None:
+            assert prefill_output is not None
+            return prefill_output.view(num_prefill_tokens, hidden_size)
+
+        output = torch.cat([prefill_output, decode_output], dim=0)
         return output.view(num_tokens, hidden_size)
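The reshape_and_cache_flash wrapper added above follows the general torch.library recipe for hiding an in-place, view-mutating operation from the inductor. A self-contained sketch of the same recipe on a toy op (assumes PyTorch 2.4+; the op name and namespace are illustrative):

import torch


@torch.library.custom_op("demo::inplace_scale", mutates_args=["buf"])
def inplace_scale(buf: torch.Tensor, scale: float) -> None:
    # Real implementation: mutate `buf` in place, which inductor cannot
    # trace through when `buf` is a view.
    buf.mul_(scale)


@inplace_scale.register_fake
def _(buf: torch.Tensor, scale: float) -> None:
    # Fake (meta) implementation: describes the op for tracing only.
    pass


x = torch.ones(4)
torch.ops.demo.inplace_scale(x, 2.0)
assert torch.allclose(x, torch.full((4,), 2.0))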

View File

@@ -30,7 +30,8 @@ from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                         make_tensor_with_pad)

 if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                          ModelInputForGPUWithSamplingMetadata)


 class FlashInferBackend(AttentionBackend):
@@ -268,6 +269,10 @@ class FlashInferMetadata(AttentionMetadata):
     query_start_loc: Optional[torch.Tensor] = None
     block_tables: Optional[torch.Tensor] = None

+    # used for GPU in-place advance_step
+    seq_lens_tensor: Optional[torch.Tensor] = None
+    block_table_bound: Optional[torch.Tensor] = None
+
     # An example for paged_kv_indices, paged_kv_indptr:
     # request 1, page indices [0, 5, 8]
     # request 2, page indices [1, 6, 7]
@@ -318,6 +323,8 @@ class FlashInferMetadata(AttentionMetadata):
             assert self.paged_kv_indices is not None
             assert self.paged_kv_indptr is not None
             assert self.paged_kv_last_page_len is not None
+            assert self.block_table_bound is not None
+            assert self.seq_lens_tensor is not None
             batch_size = self.query_start_loc.shape[0] - 1
             assert batch_size >= 0
             # We will use flash attention for profiling to
@@ -327,6 +334,8 @@ class FlashInferMetadata(AttentionMetadata):
             self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
             self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
                 self.device)
+            self.block_table_bound = self.block_table_bound.to(self.device)
+            self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
             self.paged_kv_indices = self.paged_kv_indices.to(self.device)
             self.prefill_wrapper.end_forward()
             self.prefill_wrapper.begin_forward(
@@ -335,14 +344,18 @@ class FlashInferMetadata(AttentionMetadata):
                 self.num_qo_heads, self.num_kv_heads, self.head_dim,
                 self.page_size)
         else:
-            if not self.use_cuda_graph:
-                assert self.paged_kv_indices is not None
-                assert self.paged_kv_indptr is not None
-                assert self.paged_kv_last_page_len is not None
-                self.paged_kv_indices = self.paged_kv_indices.to(self.device)
-                self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
-                self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
-                    self.device)
+            assert self.paged_kv_indices is not None
+            assert self.paged_kv_indptr is not None
+            assert self.paged_kv_last_page_len is not None
+            self.paged_kv_indices = self.paged_kv_indices.to(self.device)
+            self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
+            self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
+                self.device)
+            # handle model warmup path
+            if self.block_table_bound is not None:
+                self.block_table_bound = self.block_table_bound.to(self.device)
+            if self.seq_lens_tensor is not None:
+                self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)

             assert self.decode_wrapper is not None
             self.decode_wrapper.end_forward()
@@ -391,6 +404,48 @@ class FlashInferMetadata(AttentionMetadata):

         return self

+    def advance_step(
+        self,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        sampled_token_ids: Optional[torch.Tensor],
+        block_size: int,
+        num_seqs: int,
+        num_queries: int,
+    ):
+        """
+        Update metadata in-place to advance one decode step.
+        """
+
+        assert num_seqs > 0
+        assert num_queries > 0
+        assert model_input.attn_metadata is not None
+        assert sampled_token_ids is not None
+
+        # When using cudagraph, the num_seqs is padded to the next captured
+        # batch size, but num_queries tracks the actual number of requests in
+        # the batch. For --enforce-eager mode, num_seqs == num_queries
+        if num_seqs != num_queries:
+            assert num_seqs > num_queries
+            assert self.use_cuda_graph
+
+        model_input.input_tokens[:num_queries] = sampled_token_ids.flatten()
+
+        # Update GPU tensors
+        ops.advance_step_flashinfer(
+            num_seqs=num_seqs,
+            num_queries=num_queries,
+            block_size=block_size,
+            input_tokens=model_input.input_tokens,
+            sampled_token_ids=model_input.input_tokens,
+            input_positions=model_input.input_positions,
+            seq_lens=self.seq_lens_tensor,
+            slot_mapping=self.slot_mapping,
+            block_tables=self.block_tables,
+            paged_kv_indices=self.paged_kv_indices,
+            paged_kv_indptr=self.paged_kv_indptr,
+            paged_kv_last_page_len=self.paged_kv_last_page_len,
+            block_table_bound=self.block_table_bound)
+

 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
@@ -428,7 +483,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         self.paged_kv_indptr: List[int] = [0]
         # paged_kv_last_page_len is the length of the last page of each request
         self.paged_kv_last_page_len: List[int] = []
+        self.total_blocks = 0
         self.is_profile_run: bool = False

     def _add_seq_group(
@@ -499,6 +554,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             # block_table_bound is 1 with 1 valid block.
             # If seq_len = 15, block_size = 16,
             # block_table_bound is 0 + 1 with 1 valid block.
+            self.total_blocks += len(block_table)
             block_table_bound = seq_len // self.block_size + 1 \
                 if seq_len % self.block_size != 0 \
                 else seq_len // self.block_size
@@ -541,9 +597,19 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             # The shape of graph_block_tables is
             # [max batch size, max context len // block size].
             input_block_tables = self.runner.graph_block_tables[:batch_size]
+            max_blocks = input_block_tables.shape[1]
             for i, block_table in enumerate(self.block_tables):
                 if block_table:
-                    input_block_tables[i, :len(block_table)] = block_table
+                    num_blocks = len(block_table)
+                    if num_blocks <= max_blocks:
+                        input_block_tables[i, :num_blocks] = block_table
+                    else:
+                        # It may be possible to have more blocks allocated due
+                        # to lookahead slots of multi-step, however, they are
+                        # not used anyway, so can be safely ignored.
+                        input_block_tables[
+                            i, :max_blocks] = block_table[:max_blocks]

             block_tables = torch.from_numpy(input_block_tables).to(
                 device, non_blocking=True)
@@ -583,6 +649,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                      out=query_start_loc[1:])

         if len(self.paged_kv_indptr) > 0:
+            # extend to the maximum number of blocks as returned by the
+            # scheduler
+            self.paged_kv_indices.extend(
+                [0] * (self.total_blocks - len(self.paged_kv_indices)))
             paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices,
                                                    device="cpu",
                                                    dtype=torch.int)
@@ -591,10 +661,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                                                   dtype=torch.int)
             paged_kv_last_page_len_tensor = torch.tensor(
                 self.paged_kv_last_page_len, device="cpu", dtype=torch.int)
+            block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) -
+                                                   1,
+                                                   device="cpu",
+                                                   dtype=torch.int)
         else:
             paged_kv_indices_tensor = None
             paged_kv_indptr_tensor = None
             paged_kv_last_page_len_tensor = None
+            block_table_bound_tensor = None

         if self.runner.kv_cache_dtype.startswith("fp8"):
             kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
@@ -613,6 +688,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             paged_kv_indptr=paged_kv_indptr_tensor,
             paged_kv_indices=paged_kv_indices_tensor,
             paged_kv_last_page_len=paged_kv_last_page_len_tensor,
+            block_table_bound=block_table_bound_tensor,
+            seq_lens_tensor=seq_lens_tensor,
             num_qo_heads=self.runner.model_config.get_num_attention_heads(
                 self.runner.parallel_config),
             num_kv_heads=self.runner.model_config.get_num_kv_heads(
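Incidentally, the block_table_bound expression in _add_seq_group above is a plain ceiling division; an equivalent formulation (a sketch, not how the source spells it):

def blocks_needed(seq_len: int, block_size: int) -> int:
    # ceil(seq_len / block_size) without floats: 15 tokens with 16-token
    # blocks need 1 block; 16 need 1; 17 need 2.
    return -(-seq_len // block_size)


assert blocks_needed(15, 16) == 1
assert blocks_needed(16, 16) == 1
assert blocks_needed(17, 16) == 2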

View File

@@ -869,6 +869,13 @@ class ParallelConfig:
                     f"distributed executor backend "
                     f"'{self.distributed_executor_backend}'.")

+        if current_platform.is_tpu() and self.world_size > 1:
+            if self.distributed_executor_backend is None:
+                self.distributed_executor_backend = "ray"
+            if self.distributed_executor_backend != "ray":
+                raise ValueError(
+                    "TPU backend only supports Ray for distributed inference.")
+
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
@@ -876,7 +883,7 @@ class ParallelConfig:
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray_is_available()
-            if (torch.cuda.is_available()
+            if (current_platform.is_cuda()
                     and cuda_device_count_stateless() < self.world_size):
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "

View File

@@ -843,6 +843,13 @@ class EngineArgs:
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()

+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+            self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len
@@ -874,7 +881,10 @@ class EngineArgs:
             # If not explicitly set, enable chunked prefill by default for
             # long context (> 32K) models. This is to avoid OOM errors in the
             # initial memory profiling phase.
-            if use_long_context:
+
+            # Chunked prefill is currently disabled for multimodal models by
+            # default.
+            if use_long_context and not model_config.is_multimodal_model:
                 is_gpu = device_config.device_type == "cuda"
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
@@ -1035,7 +1045,6 @@ class EngineArgs:
 @dataclass
 class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
-    engine_use_ray: bool = False
     disable_log_requests: bool = False

     @staticmethod
@@ -1043,16 +1052,6 @@ class AsyncEngineArgs(EngineArgs):
                      async_args_only: bool = False) -> FlexibleArgumentParser:
         if not async_args_only:
             parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--engine-use-ray',
-                            action='store_true',
-                            help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.'
-                            '(DEPRECATED. This argument is deprecated '
-                            'and will be removed in a future update. '
-                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
-                            'use it. See '
-                            'https://github.com/vllm-project/vllm/issues/7045.'
-                            ')')
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')

View File

@@ -4,22 +4,18 @@ from functools import partial
from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
Mapping, Optional, Set, Tuple, Type, Union) Mapping, Optional, Set, Tuple, Type, Union)
from typing_extensions import assert_never
import vllm.envs as envs import vllm.envs as envs
from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig) ParallelConfig, SchedulerConfig)
from vllm.core.scheduler import SchedulerOutputs from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.async_timeout import asyncio_timeout
from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine, from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
PromptComponents, SchedulerOutputState)
from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.metrics_types import StatLoggerBase
from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.ray_utils import initialize_ray_cluster, ray from vllm.executor.gpu_executor import GPUExecutorAsync
from vllm.inputs import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs, from vllm.executor.ray_utils import initialize_ray_cluster
SingletonPromptInputs) from vllm.inputs import PromptInputs
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
@@ -30,7 +26,6 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import ExecuteModelRequest from vllm.sequence import ExecuteModelRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import print_warning_once
logger = init_logger(__name__) logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -404,139 +399,6 @@ class _AsyncLLMEngine(LLMEngine):
"""Stop the remote worker execution loop.""" """Stop the remote worker execution loop."""
await self.model_executor.stop_remote_worker_execution_loop_async() await self.model_executor.stop_remote_worker_execution_loop_async()
async def _tokenize_prompt_async(
self,
prompt: str,
request_id: str,
lora_request: Optional[LoRARequest],
) -> List[int]:
"""Async version of :meth:`_tokenize_prompt`."""
tokenizer = self.get_tokenizer_group(
missing_msg="prompts must be None if skip_tokenizer_init is True")
return await tokenizer.encode_async(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
async def _extract_prompt_components_async(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> PromptComponents:
"""Async version of :meth:`_extract_prompt_components`."""
if isinstance(inputs, str):
prompt = inputs
prompt_token_ids = await self._tokenize_prompt_async(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = None
elif isinstance(inputs, dict):
if "prompt_token_ids" in inputs:
prompt = None
prompt_token_ids = inputs["prompt_token_ids"]
else:
# NOTE: This extra assignment is required to pass mypy
prompt = parsed_prompt = inputs["prompt"]
prompt_token_ids = await self._tokenize_prompt_async(
parsed_prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = inputs.get("multi_modal_data")
else:
assert_never(inputs)
return prompt, prompt_token_ids, multi_modal_data
async def _process_encoder_decoder_prompt_async(
self,
inputs: PromptInputs,
request_id: str,
) -> EncoderDecoderLLMInputs:
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents
if is_explicit_encoder_decoder_prompt(inputs):
encoder_task = self._extract_prompt_components_async(
inputs["encoder_prompt"],
request_id=request_id,
)
if (decoder_input := inputs["decoder_prompt"]) is None:
encoder_comps = await encoder_task
decoder_comps = None, None, None
else:
decoder_task = self._extract_prompt_components_async(
decoder_input,
request_id=request_id,
)
encoder_comps, decoder_comps = await asyncio.gather(
encoder_task, decoder_task)
else:
encoder_comps = await self._extract_prompt_components_async(
inputs,
request_id=request_id,
)
decoder_comps = None, None, None
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
async def _process_decoder_only_prompt_async(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> LLMInputs:
"""Async version of :meth:`_process_decoder_only_prompt`."""
prompt_comps = await self._extract_prompt_components_async(
inputs,
request_id=request_id,
lora_request=lora_request,
)
return self._build_decoder_only_llm_inputs(
prompt_comps,
prompt_adapter_request=prompt_adapter_request,
)
async def process_model_inputs_async(
self,
inputs: PromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
"""Async version of :meth:`process_model_inputs`."""
if self.is_encoder_decoder_model():
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
model_inputs = await self._process_encoder_decoder_prompt_async(
inputs,
request_id=request_id,
)
else:
if is_explicit_encoder_decoder_prompt(inputs):
raise ValueError("Cannot pass encoder-decoder prompt "
"to decoder-only models")
# Decoder-only operation
model_inputs = await self._process_decoder_only_prompt_async(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
return self.input_processor(model_inputs)
async def add_request_async(
self,
request_id: str,
@@ -554,12 +416,13 @@ class _AsyncLLMEngine(LLMEngine):
if arrival_time is None:
arrival_time = time.time()

-processed_inputs = await self.process_model_inputs_async(
+preprocessed_inputs = await self.input_preprocessor.preprocess_async(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
+processed_inputs = self.input_processor(preprocessed_inputs)

self._add_processed_request(
request_id=request_id,
@@ -590,9 +453,6 @@ class AsyncLLMEngine:
worker_use_ray: Whether to use Ray for model workers. Required for
distributed execution. Should be the same as
`parallel_config.worker_use_ray`.
-engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
-    async frontend will be executed in a separate process as the
-    model workers.
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
@@ -604,41 +464,23 @@ class AsyncLLMEngine:
def __init__(self,
worker_use_ray: bool,
-engine_use_ray: bool,
*args,
log_requests: bool = True,
start_engine_loop: bool = True,
**kwargs) -> None:
self.worker_use_ray = worker_use_ray
-self.engine_use_ray = engine_use_ray
self.log_requests = log_requests
-self.engine = self._init_engine(*args, **kwargs)
+self.engine = self._engine_class(*args, **kwargs)

# This ensures quick processing of request outputs
# so the append to asyncio queues is not delayed,
# especially for multi-step.
#
-# TODO: Currently, disabled for engine_use_ray, ask
-# Cody/Will/Woosuk about this case.
-self.use_process_request_outputs_callback = not self.engine_use_ray
+self.use_process_request_outputs_callback = True

if self.use_process_request_outputs_callback:
self.engine.process_request_outputs_callback = \
self.process_request_outputs

-if self.engine_use_ray:
-    print_warning_once(
-        "DEPRECATED. `--engine-use-ray` is deprecated and will "
-        "be removed in a future update. "
-        "See https://github.com/vllm-project/vllm/issues/7045.")
-    if envs.VLLM_ALLOW_ENGINE_USE_RAY:
-        print_warning_once(
-            "VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
-    else:
-        raise ValueError("`--engine-use-ray` is deprecated. "
-                         "Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
-                         "force use it")

self.background_loop: Optional[asyncio.Future] = None
# We need to keep a reference to unshielded
# task as well to prevent it from being garbage
@@ -725,16 +567,11 @@ class AsyncLLMEngine:
# Create the engine configs.
engine_config = engine_args.create_engine_config()

-if engine_args.engine_use_ray:
-    from vllm.executor import ray_utils
-    ray_utils.assert_ray_available()

executor_class = cls._get_executor_cls(engine_config)

# Create the async LLM engine.
engine = cls(
executor_class.uses_ray,
-engine_args.engine_use_ray,
**engine_config.to_dict(),
executor_class=executor_class,
log_requests=not engine_args.disable_log_requests,
@@ -777,10 +614,6 @@ class AsyncLLMEngine:
self,
lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
-if self.engine_use_ray:
-    return await self.engine.get_tokenizer.remote(  # type: ignore
-        lora_request)

return await (self.engine.get_tokenizer_group().
get_lora_tokenizer_async(lora_request))
@@ -814,26 +647,6 @@ class AsyncLLMEngine:
self._background_loop_unshielded = None
self.background_loop = None
def _init_engine(self, *args,
**kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
if not self.engine_use_ray:
engine_class = self._engine_class
elif self.worker_use_ray:
engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
else:
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
# order of the arguments.
cache_config = kwargs["cache_config"]
parallel_config = kwargs["parallel_config"]
if (parallel_config.tensor_parallel_size == 1
and parallel_config.pipeline_parallel_size == 1):
num_gpus = cache_config.gpu_memory_utilization
else:
num_gpus = 1
engine_class = ray.remote(num_gpus=num_gpus)(
self._engine_class).remote
return engine_class(*args, **kwargs)
async def engine_step(self, virtual_engine: int) -> bool:
"""Kick the engine to process the waiting requests.
@@ -844,13 +657,8 @@ class AsyncLLMEngine:
for new_request in new_requests:
# Add the request into the vLLM engine's waiting queue.
-# TODO: Maybe add add_request_batch to reduce Ray overhead
try:
-    if self.engine_use_ray:
-        await self.engine.add_request.remote(  # type: ignore
-            **new_request)
-    else:
-        await self.engine.add_request_async(**new_request)
+    await self.engine.add_request_async(**new_request)
except ValueError as e:
# TODO: use a vLLM specific error for failed validation
self._request_tracker.process_exception(
@@ -862,10 +670,7 @@ class AsyncLLMEngine:
if aborted_requests:
await self._engine_abort(aborted_requests)

-if self.engine_use_ray:
-    request_outputs = await self.engine.step.remote()  # type: ignore
-else:
-    request_outputs = await self.engine.step_async(virtual_engine)
+request_outputs = await self.engine.step_async(virtual_engine)

# Put the outputs into the corresponding streams.
# If used as a callback, then already invoked inside
@@ -891,16 +696,10 @@ class AsyncLLMEngine:
return all_finished

async def _engine_abort(self, request_ids: Iterable[str]):
-    if self.engine_use_ray:
-        await self.engine.abort_request.remote(request_ids)  # type: ignore
-    else:
-        self.engine.abort_request(request_ids)
+    self.engine.abort_request(request_ids)

async def run_engine_loop(self):
-    if self.engine_use_ray:
-        pipeline_parallel_size = 1  # type: ignore
-    else:
-        pipeline_parallel_size = \
-            self.engine.parallel_config.pipeline_parallel_size
+    pipeline_parallel_size = \
+        self.engine.parallel_config.pipeline_parallel_size
has_requests_in_progress = [False] * pipeline_parallel_size
while True:
@@ -912,12 +711,7 @@ class AsyncLLMEngine:
# timeout, and unblocks the RPC thread in the workers so that
# they can process any other queued control plane messages,
# such as add/remove lora adapters.
-if self.engine_use_ray:
-    await (self.engine.stop_remote_worker_execution_loop.
-           remote()  # type: ignore
-           )
-else:
-    await self.engine.stop_remote_worker_execution_loop_async()
+await self.engine.stop_remote_worker_execution_loop_async()
await self._request_tracker.wait_for_new_requests()
logger.debug("Got new requests!")
requests_in_progress = [
@@ -938,17 +732,9 @@ class AsyncLLMEngine:
for task in done:
result = task.result()
virtual_engine = requests_in_progress.index(task)
-if self.engine_use_ray:
-    has_unfinished_requests = (
-        await (self.engine.
-               has_unfinished_requests_for_virtual_engine.
-               remote(  # type: ignore
-                   virtual_engine)))
-else:
-    has_unfinished_requests = (
-        self.engine.
-        has_unfinished_requests_for_virtual_engine(
-            virtual_engine))
+has_unfinished_requests = (
+    self.engine.has_unfinished_requests_for_virtual_engine(
+        virtual_engine))
if result or has_unfinished_requests:
requests_in_progress[virtual_engine] = (
asyncio.create_task(
@@ -1190,52 +976,29 @@ class AsyncLLMEngine:
async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
-if self.engine_use_ray:
-    return await self.engine.get_model_config.remote()  # type: ignore
-else:
-    return self.engine.get_model_config()
+return self.engine.get_model_config()

async def get_parallel_config(self) -> ParallelConfig:
"""Get the parallel configuration of the vLLM engine."""
-if self.engine_use_ray:
-    return await self.engine.get_parallel_config.remote()  # type: ignore
-else:
-    return self.engine.get_parallel_config()
+return self.engine.get_parallel_config()

async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
-if self.engine_use_ray:
-    return await self.engine.get_decoding_config.remote()  # type: ignore
-else:
-    return self.engine.get_decoding_config()
+return self.engine.get_decoding_config()

async def get_scheduler_config(self) -> SchedulerConfig:
"""Get the scheduling configuration of the vLLM engine."""
-if self.engine_use_ray:
-    return await self.engine.get_scheduler_config.remote()  # type: ignore
-else:
-    return self.engine.get_scheduler_config()
+return self.engine.get_scheduler_config()

async def get_lora_config(self) -> LoRAConfig:
"""Get the lora configuration of the vLLM engine."""
-if self.engine_use_ray:
-    return await self.engine.get_lora_config.remote()  # type: ignore
-else:
-    return self.engine.get_lora_config()
+return self.engine.get_lora_config()

async def do_log_stats(
self,
scheduler_outputs: Optional[SchedulerOutputs] = None,
model_output: Optional[List[SamplerOutput]] = None) -> None:
-if self.engine_use_ray:
-    await self.engine.do_log_stats.remote(  # type: ignore
-        scheduler_outputs, model_output)
-else:
-    self.engine.do_log_stats()
+self.engine.do_log_stats()

async def check_health(self) -> None:
"""Raises an error if engine is unhealthy."""
@@ -1244,40 +1007,30 @@ class AsyncLLMEngine:
if self.is_stopped:
raise AsyncEngineDeadError("Background loop is stopped.")

-if self.engine_use_ray:
-    try:
-        await self.engine.check_health.remote()  # type: ignore
-    except ray.exceptions.RayActorError as e:
-        raise RuntimeError("Engine is dead.") from e
-else:
-    await self.engine.check_health_async()
+await self.engine.check_health_async()
logger.debug("Health check took %fs", time.perf_counter() - t)

async def is_tracing_enabled(self) -> bool:
-    if self.engine_use_ray:
-        return await self.engine.is_tracing_enabled.remote()  # type: ignore
-    else:
-        return self.engine.is_tracing_enabled()
+    return self.engine.is_tracing_enabled()

def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
-    if self.engine_use_ray:
-        ray.get(
-            self.engine.add_logger.remote(  # type: ignore
-                logger_name=logger_name, logger=logger))
-    else:
-        self.engine.add_logger(logger_name=logger_name, logger=logger)
+    self.engine.add_logger(logger_name=logger_name, logger=logger)

def remove_logger(self, logger_name: str) -> None:
-    if self.engine_use_ray:
-        ray.get(
-            self.engine.remove_logger.remote(  # type: ignore
-                logger_name=logger_name))
-    else:
-        self.engine.remove_logger(logger_name=logger_name)
+    self.engine.remove_logger(logger_name=logger_name)

async def start_profile(self) -> None:
-    self.engine.model_executor._run_workers("start_profile")
+    # using type instead of isinstance to check to avoid capturing
+    # inherited classes
+    if type(self.engine.model_executor) == GPUExecutorAsync:
+        self.engine.model_executor.start_profile()
+    else:
+        self.engine.model_executor._run_workers("start_profile")

async def stop_profile(self) -> None:
-    self.engine.model_executor._run_workers("stop_profile")
+    # using type instead of isinstance to check to avoid capturing
+    # inherited classes
+    if type(self.engine.model_executor) == GPUExecutorAsync:
+        self.engine.model_executor.stop_profile()
+    else:
+        self.engine.model_executor._run_workers("stop_profile")
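Why `type(...) ==` rather than `isinstance()`: a subclass such as a multiprocessing executor must not take the single-process profiling path. A self-contained, runnable illustration of the dispatch pattern; the class names below are stand-ins, not the vLLM executors:

class GPUExecutor:
    def start_profile(self):
        print("profile the current process only")

class MultiprocessingGPUExecutor(GPUExecutor):
    def _run_workers(self, method: str):
        print(f"broadcast {method} to all workers")

def start_profile(executor):
    if type(executor) == GPUExecutor:  # deliberately excludes subclasses
        executor.start_profile()
    else:
        executor._run_workers("start_profile")

start_profile(GPUExecutor())                 # single-process path
start_profile(MultiprocessingGPUExecutor())  # broadcast path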


@@ -3,13 +3,13 @@ import time
from collections import deque
from contextlib import contextmanager
from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List,
-                    Mapping, NamedTuple, Optional)
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+                    Iterable, List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence
-from typing import Set, Tuple, Type, Union
+from typing import Set, Type, Union

import torch
-from typing_extensions import TypeVar, assert_never
+from typing_extensions import TypeVar

import vllm.envs as envs
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
@@ -26,20 +26,19 @@ from vllm.engine.output_processor.interfaces import (
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.engine.output_processor.util import create_output_by_sequence_group
from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.gpu_executor import GPUExecutor
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
-                         InputRegistry, LLMInputs, PromptInputs,
-                         SingletonPromptInputs)
-from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
+                         InputRegistry, LLMInputs, PromptInputs)
+from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.multimodal import MultiModalDataDict
from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
                          RequestOutputFactory)
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
                           Sequence, SequenceGroup, SequenceGroupMetadata,
                           SequenceStatus)
@@ -75,11 +74,6 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)

-PromptComponents = Tuple[Optional[str], List[int],
-                         Optional[MultiModalDataDict]]
-DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
-                                Optional[MultiModalDataDict]]

@dataclass
class SchedulerOutputState:
@@ -225,9 +219,6 @@ class LLMEngine:
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
input_registry: InputRegistry = INPUT_REGISTRY,
-# To improve performance, only final requests outputs may be required.
-# If this set to true, then no intermediate outputs will be returned.
-step_return_finished_only: bool = False,
) -> None:
logger.info(
"Initializing an LLM engine (v%s) with config: "
@@ -295,7 +286,6 @@ class LLMEngine:
self.observability_config = observability_config or ObservabilityConfig(
)
self.log_stats = log_stats
-self.step_return_finished_only = step_return_finished_only

if not self.model_config.skip_tokenizer_init:
self.tokenizer = self._init_tokenizer()
@@ -317,6 +307,9 @@ class LLMEngine:
self.generation_config_fields = _load_generation_config_dict(
model_config)

+self.input_preprocessor = InputPreprocessor(model_config,
+                                            self.tokenizer)

self.input_registry = input_registry
self.input_processor = input_registry.create_input_processor(
model_config)
@@ -397,7 +390,7 @@ class LLMEngine:
# Currently used by AsyncLLMEngine to ensure quick append
# of request outputs to asyncio queues
-self.process_request_outputs_callback = None
+self.process_request_outputs_callback: Optional[Callable] = None

# Create the scheduler.
# NOTE: the cache_config here have been updated with the numbers of
@@ -575,19 +568,15 @@ class LLMEngine:
if model_executor := getattr(self, "model_executor", None):
model_executor.shutdown()

-MISSING_TOKENIZER_GROUP_MSG = ("Unable to get tokenizer because "
-                               "skip_tokenizer_init is True")

def get_tokenizer_group(
self,
group_type: Type[_G] = BaseTokenizerGroup,
-*,
-missing_msg: str = MISSING_TOKENIZER_GROUP_MSG,
) -> _G:
tokenizer_group = self.tokenizer

if tokenizer_group is None:
-    raise ValueError(missing_msg)
+    raise ValueError("Unable to get tokenizer because "
+                     "skip_tokenizer_init is True")
if not isinstance(tokenizer_group, group_type):
raise TypeError("Invalid type of tokenizer group. "
f"Expected type: {group_type}, but "
@@ -619,52 +608,6 @@ class LLMEngine:
self.prompt_adapter_config.verify_with_model_config(
self.model_config)
def _get_bos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for BOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
def _get_eos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for EOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
def _get_decoder_start_token_id(self) -> Optional[int]:
'''
Obtain the decoder start token id employed by an encoder/decoder
model. Returns None for non-encoder/decoder models or if the
model config is unavailable.
'''
if not self.is_encoder_decoder_model():
logger.warning("Using None for decoder start token id because "
"this is not an encoder/decoder model.")
return None
if (self.model_config is None or self.model_config.hf_config is None):
logger.warning("Using None for decoder start token id because "
"model config is not available.")
return None
dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None)
if dec_start_token_id is None:
logger.warning("Falling back on <BOS> for decoder start token id "
"because decoder start token id is not available.")
dec_start_token_id = self._get_bos_token_id()
return dec_start_token_id
def _add_processed_request(
self,
request_id: str,
@@ -679,7 +622,7 @@ class LLMEngine:
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
-eos_token_id = self._get_eos_token_id(lora_request)
+eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
lora_request, prompt_adapter_request)
@@ -729,334 +672,6 @@ class LLMEngine:
def stop_remote_worker_execution_loop(self) -> None:
self.model_executor.stop_remote_worker_execution_loop()
_LLMInputComponentsType = Tuple[str, List[int]]
def _prepare_decoder_input_ids_for_generation(
self,
decoder_input_ids: Optional[List[int]],
) -> List[int]:
"""
Prepares `decoder_input_ids` for generation with encoder-decoder models.
Based on
https://github.com/huggingface/transformers/blob/
4037a2b5b1278736e566aec12e169100275545ea/
src/transformers/generation/utils.py
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
Arguments:
* decoder_input_ids: input token ids to preprocess
Returns:
* Processed token list
"""
decoder_start_token_id = self._get_decoder_start_token_id()
assert decoder_start_token_id is not None
if decoder_input_ids is None:
# no decoder prompt input ->
# use decoder_start_token_id as decoder_input_ids
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
if (len(decoder_input_ids) == 0
or decoder_input_ids[0] != decoder_start_token_id):
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
return decoder_input_ids
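The rule above in isolation, as a runnable pure-Python sketch (token ids are invented for illustration): the decoder prompt always ends up starting with decoder_start_token_id, falling back to a [<BOS>] default prompt when none is given.

def prepare_decoder_input_ids(decoder_input_ids, decoder_start_token_id,
                              bos_token_id):
    if decoder_input_ids is None:
        decoder_input_ids = [bos_token_id]  # default decoder prompt
    if (len(decoder_input_ids) == 0
            or decoder_input_ids[0] != decoder_start_token_id):
        decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
    return decoder_input_ids

assert prepare_decoder_input_ids(None, 2, 1) == [2, 1]
assert prepare_decoder_input_ids([5, 6], 2, 1) == [2, 5, 6]
assert prepare_decoder_input_ids([2, 5], 2, 1) == [2, 5]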
def _tokenize_prompt(
self,
prompt: str,
request_id: str,
lora_request: Optional[LoRARequest],
) -> List[int]:
'''
Wrapper around application of the model's tokenizer.
Arguments:
* prompt
* request_id
* lora_request
Returns:
* prompt token ids
'''
tokenizer = self.get_tokenizer_group(
missing_msg="prompts must be None if skip_tokenizer_init is True")
return tokenizer.encode(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
def _extract_prompt_components(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> PromptComponents:
'''
Extract the components of any single encoder or decoder input prompt.
Arguments:
* request_id
* inputs: single encoder or decoder input prompt
* lora_request: this is only valid for decoder prompts
Returns:
* prompt
* prompt_token_ids
* multi_modal_data
'''
if isinstance(inputs, str):
prompt = inputs
prompt_token_ids = self._tokenize_prompt(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = None
elif isinstance(inputs, dict):
if "prompt_token_ids" in inputs:
prompt = None
prompt_token_ids = inputs["prompt_token_ids"]
else:
# NOTE: This extra assignment is required to pass mypy
prompt = parsed_prompt = inputs["prompt"]
prompt_token_ids = self._tokenize_prompt(
parsed_prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = inputs.get("multi_modal_data")
else:
assert_never(inputs)
return prompt, prompt_token_ids, multi_modal_data
def _apply_prompt_adapter(
self,
prompt_token_ids: List[int],
prompt_adapter_request: Optional[PromptAdapterRequest],
) -> List[int]:
if prompt_adapter_request:
prompt_token_ids = (
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+ prompt_token_ids)
return prompt_token_ids
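The prompt-adapter transform above, shown standalone: N virtual tokens are reserved as zero ids in front of the real prompt. Token ids here are invented for illustration.

num_virtual_tokens = 3
prompt_token_ids = [101, 7592, 102]
padded = [0] * num_virtual_tokens + prompt_token_ids
assert padded == [0, 0, 0, 101, 7592, 102]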
def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
'''
Specifically for encoder/decoder models:
generate a default decoder prompt for when
the user specifies only the encoder prompt.
Encoder/decoder models utilize the decoder
prompt in different ways; as new models are
added, it is intended that this function
will be extended to produce differing
default decoder prompts, depending on the
model variety.
Absent a special case, the default behavior
of this method is to mirror the behavior of
the HuggingFace (HF) GenerationMixin for a None
decoder prompt, which is to employ a logit processor
setting to force the first decoded token to be <BOS>.
Here, this behavior is approximated by having the
"default" decoder prompt be <BOS>.
However, it is possible that in the future
other models may have different or more
complex logic for the default decoder prompt.
This motivates having a special helper method
for default decoder prompts.
Returns:
* prompt_token_ids
'''
bos_token_id = self._get_bos_token_id()
assert bos_token_id is not None
return [bos_token_id]
def _build_enc_dec_llm_inputs(
self,
encoder_comps: PromptComponents,
decoder_comps: DecoderPromptComponents,
) -> EncoderDecoderLLMInputs:
encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
if encoder_mm_data is not None or decoder_mm_data is not None:
raise ValueError("Multi-modal encoder-decoder models are "
"not supported yet")
decoder_prompt_ids = (
self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
return EncoderDecoderLLMInputs(
prompt_token_ids=decoder_prompt_ids,
prompt=decoder_prompt,
encoder_prompt_token_ids=encoder_prompt_ids,
encoder_prompt=encoder_prompt,
)
def _process_encoder_decoder_prompt(
self,
inputs: PromptInputs,
request_id: str,
) -> EncoderDecoderLLMInputs:
'''
For encoder/decoder models only:
Process an input prompt into an
:class:`EncoderDecoderLLMInputs` instance.
There are two types of input prompts:
singleton prompts which carry only the
encoder prompt, and explicit encoder/decoder
prompts which carry both the encoder and the
decoder prompts as member variables.
This function handles the following scenarios:
* Singleton encoder prompt: extract encoder prompt
token ids & infer default decoder prompt token ids
* Explicit encoder/decoder prompt: extract encoder
and decoder prompt token ids
Note that for Explicit encoder/decoder prompts,
each sub-prompt (encoder or decoder prompt) can
have any possible singleton type; thus this
method relies on helper functions to obtain
token ids for the sub-prompts.
Arguments:
* inputs: an input prompt
* request_id
Returns:
* :class:`EncoderDecoderLLMInputs` instance
'''
encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents
if is_explicit_encoder_decoder_prompt(inputs):
encoder_comps = self._extract_prompt_components(
inputs["encoder_prompt"],
request_id=request_id,
)
if (decoder_input := inputs["decoder_prompt"]) is None:
decoder_comps = None, None, None
else:
decoder_comps = self._extract_prompt_components(
decoder_input,
request_id=request_id,
)
else:
encoder_comps = self._extract_prompt_components(
inputs,
request_id=request_id,
)
decoder_comps = None, None, None
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
def _build_decoder_only_llm_inputs(
self,
prompt_comps: PromptComponents,
prompt_adapter_request: Optional[PromptAdapterRequest],
) -> LLMInputs:
prompt, prompt_token_ids, multi_modal_data = prompt_comps
prompt_token_ids = self._apply_prompt_adapter(
prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
return LLMInputs(prompt_token_ids=prompt_token_ids,
prompt=prompt,
multi_modal_data=multi_modal_data)
def _process_decoder_only_prompt(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> LLMInputs:
'''
For decoder-only models:
Process an input prompt into an :class:`LLMInputs` instance.
Arguments:
* inputs: input prompt
* request_id
* lora_request
* prompt_adapter_request
Returns:
* :class:`LLMInputs` instance
'''
prompt_comps = self._extract_prompt_components(
inputs,
request_id=request_id,
lora_request=lora_request,
)
return self._build_decoder_only_llm_inputs(
prompt_comps,
prompt_adapter_request=prompt_adapter_request,
)
def process_model_inputs(
self,
inputs: PromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
if self.is_encoder_decoder_model():
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
model_inputs = self._process_encoder_decoder_prompt(
inputs,
request_id=request_id,
)
else:
if is_explicit_encoder_decoder_prompt(inputs):
raise ValueError("Cannot pass encoder-decoder prompt "
"to decoder-only models")
# Decoder-only operation
model_inputs = self._process_decoder_only_prompt(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
return self.input_processor(model_inputs)
def add_request(
self,
request_id: str,
@@ -1115,12 +730,13 @@ class LLMEngine:
if arrival_time is None:
arrival_time = time.time()

-processed_inputs = self.process_model_inputs(
+preprocessed_inputs = self.input_preprocessor.preprocess(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
+processed_inputs = self.input_processor(preprocessed_inputs)

self._add_processed_request(
request_id=request_id,
@@ -1378,7 +994,8 @@ class LLMEngine:
seq_group = scheduled_seq_group.seq_group
seq_group.maybe_set_first_token_time(now)
request_output = RequestOutputFactory.create(seq_group)
-ctx.request_outputs.append(request_output)
+if request_output:
+    ctx.request_outputs.append(request_output)

# When we process a single request, we skip it for the next time,
# and invoke the request output callback (if there was final output)
@@ -1415,14 +1032,19 @@ class LLMEngine:
seq_group = scheduled_seq_group.seq_group
seq_group.maybe_set_first_token_time(now)
-if (seq_group.is_finished()
-        if self.step_return_finished_only else True):
-    request_output = RequestOutputFactory.create(seq_group)
-    ctx.request_outputs.append(request_output)
+request_output = RequestOutputFactory.create(seq_group)
+if request_output:
+    ctx.request_outputs.append(request_output)

for seq_group in scheduler_outputs.ignored_seq_groups:
+    params = seq_group.sampling_params
+    if params is not None and params.output_kind == (
+            RequestOutputKind.DELTA) and not seq_group.is_finished():
+        continue
+
    request_output = RequestOutputFactory.create(seq_group)
-    ctx.request_outputs.append(request_output)
+    if request_output:
+        ctx.request_outputs.append(request_output)

# Immediately process request outputs here (if callback is given)
if (ctx.request_outputs
@@ -1435,7 +1057,8 @@ class LLMEngine:
# LLMEngine/AsyncLLMEngine directly
if is_async:
# Log stats.
-self.do_log_stats(scheduler_outputs, outputs, finished_before)
+self.do_log_stats(scheduler_outputs, outputs, finished_before,
+                  skip)

# Tracing
self.do_tracing(scheduler_outputs)
@@ -1742,18 +1365,20 @@ class LLMEngine:
def do_log_stats(self,
scheduler_outputs: Optional[SchedulerOutputs] = None,
model_output: Optional[List[SamplerOutput]] = None,
-finished_before: Optional[List[int]] = None) -> None:
+finished_before: Optional[List[int]] = None,
+skip: Optional[List[int]] = None) -> None:
"""Forced log when no requests active."""
if self.log_stats:
stats = self._get_stats(scheduler_outputs, model_output,
-                        finished_before)
+                        finished_before, skip)
for logger in self.stat_loggers.values():
logger.log(stats)

def _get_stats(self,
scheduler_outputs: Optional[SchedulerOutputs],
model_output: Optional[List[SamplerOutput]] = None,
-finished_before: Optional[List[int]] = None) -> Stats:
+finished_before: Optional[List[int]] = None,
+skip: Optional[List[int]] = None) -> Stats:
"""Get Stats to be Logged to Prometheus.

Args:
@@ -1761,6 +1386,10 @@ class LLMEngine:
the scheduled batch,
model_output: Optional, used to emit speculative decoding metrics
which are created by the workers.
+finished_before: Optional, indices of sequences that were finished
+    before. These sequences will be ignored.
+skip: Optional, indices of sequences that were preempted. These
+    sequences will be ignored.
"""
now = time.time()
@@ -1835,6 +1464,11 @@ class LLMEngine:
actual_num_batched_tokens -= 1
continue

+# Currently, skip == preempted sequences, so we need to skip
+# their log stats
+if skip and idx in skip:
+    continue

group_was_prefill = idx < scheduler_outputs.num_prefill_groups
seq_group = scheduled_seq_group.seq_group
@@ -1964,10 +1598,20 @@ class LLMEngine:
self.model_executor.check_health()

def start_profile(self) -> None:
-    self.model_executor.start_profile()
+    # using type instead of isinstance to check to avoid capturing
+    # inherited classes (MultiprocessingGPUExecutor)
+    if type(self.model_executor) == GPUExecutor:
+        self.model_executor.start_profile()
+    else:
+        self.model_executor._run_workers("start_profile")

def stop_profile(self) -> None:
-    self.model_executor.stop_profile()
+    # using type instead of isinstance to check to avoid capturing
+    # inherited classes (MultiprocessingGPUExecutor)
+    if type(self.model_executor) == GPUExecutor:
+        self.model_executor.stop_profile()
+    else:
+        self.model_executor._run_workers("stop_profile")

def is_tracing_enabled(self) -> bool:
return self.tracer is not None
@@ -2041,7 +1685,7 @@ class LLMEngine:
metrics.model_execute_time)

def is_encoder_decoder_model(self):
-    return self.model_config.is_encoder_decoder_model
+    return self.input_preprocessor.is_encoder_decoder_model()

def is_embedding_model(self):
return self.model_config.is_embedding_model
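How the new `skip` argument interacts with `finished_before` in the stats pass, as a runnable sketch of the filtering idea only (indices and group names are invented; the real code decrements token counts rather than building a list):

scheduled = ["g0", "g1", "g2", "g3"]
finished_before = [1]   # finished in an earlier multi-step iteration
skip = [2]              # preempted in this iteration
counted = [g for idx, g in enumerate(scheduled)
           if idx not in finished_before and idx not in skip]
assert counted == ["g0", "g3"]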


@@ -19,7 +19,7 @@ from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                               get_cached_tokenizer)
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -642,14 +642,12 @@ class LLM:
raise ValueError("The lengths of prompts and lora_request " raise ValueError("The lengths of prompts and lora_request "
"must be the same.") "must be the same.")
if isinstance(params, list): for sp in params if isinstance(params, list) else (params, ):
params = [ if isinstance(sp, SamplingParams):
self._add_guided_processor(param, guided_options) self._add_guided_processor(sp, guided_options)
if isinstance(param, SamplingParams) else param
for param in params # We only care about the final output
] sp.output_kind = RequestOutputKind.FINAL_ONLY
elif isinstance(params, SamplingParams):
params = self._add_guided_processor(params, guided_options)
# Add requests to the engine. # Add requests to the engine.
for i, request_inputs in enumerate(inputs): for i, request_inputs in enumerate(inputs):
@@ -709,9 +707,6 @@ class LLM:
f"output: {0:.2f} toks/s"), f"output: {0:.2f} toks/s"),
) )
# In the loop below, only finished outputs are used
self.llm_engine.step_return_finished_only = True
# Run the engine. # Run the engine.
outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
total_in_toks = 0 total_in_toks = 0
@@ -724,6 +719,7 @@ class LLM:
if use_tqdm:
if isinstance(output, RequestOutput):
# Calculate tokens only for RequestOutput
+assert output.prompt_token_ids is not None
total_in_toks += len(output.prompt_token_ids)
in_spd = total_in_toks / pbar.format_dict["elapsed"]
total_out_toks += sum(
@@ -735,9 +731,6 @@ class LLM:
f"output: {out_spd:.2f} toks/s") f"output: {out_spd:.2f} toks/s")
pbar.update(1) pbar.update(1)
# Restore original behavior
self.llm_engine.step_return_finished_only = False
if use_tqdm: if use_tqdm:
pbar.close() pbar.close()
# Sort the outputs by request ID. # Sort the outputs by request ID.
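With RequestOutputKind.FINAL_ONLY stamped on every SamplingParams, the offline LLM path no longer needs the step_return_finished_only toggle: the engine simply yields nothing for a request until it finishes. A hedged sketch of setting the parameter directly, using the enum this diff introduces:

from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind

params = SamplingParams(max_tokens=16)
params.output_kind = RequestOutputKind.FINAL_ONLY  # only the final output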


@@ -12,7 +12,8 @@ from typing_extensions import Annotated, Required, TypedDict
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.logits_processors import get_logits_processors
from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import LogitsProcessor, SamplingParams
+from vllm.sampling_params import (LogitsProcessor, RequestOutputKind,
+                                  SamplingParams)
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
@@ -316,6 +317,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
length_penalty=self.length_penalty,
logits_processors=logits_processors,
truncate_prompt_tokens=self.truncate_prompt_tokens,
+output_kind=RequestOutputKind.DELTA if self.stream \
+    else RequestOutputKind.FINAL_ONLY,
)

@model_validator(mode="before")
@@ -559,6 +562,8 @@ class CompletionRequest(OpenAIBaseModel):
length_penalty=self.length_penalty,
logits_processors=logits_processors,
truncate_prompt_tokens=self.truncate_prompt_tokens,
+output_kind=RequestOutputKind.DELTA if self.stream \
+    else RequestOutputKind.FINAL_ONLY,
)

@model_validator(mode="before")


@@ -195,7 +195,6 @@ async def main(args):
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_BATCH_RUNNER)

-# When using single vLLM without engine_use_ray
model_config = await engine.get_model_config()

if args.disable_log_requests:


@@ -246,8 +246,7 @@ class OpenAIServingChat(OpenAIServing):
def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
if request.add_generation_prompt:
return self.response_role
-else:
-    return request.messages[-1]["role"]
+return request.messages[-1]["role"]

async def chat_completion_stream_generator(
self,
@@ -264,15 +263,37 @@ class OpenAIServingChat(OpenAIServing):
# Send response for each token for each request.n (index)
num_choices = 1 if request.n is None else request.n
-previous_texts = [""] * num_choices
previous_num_tokens = [0] * num_choices
finish_reason_sent = [False] * num_choices
+num_prompt_tokens = 0

tool_parser: Optional[ToolParser] = self.tool_parser(
tokenizer) if self.tool_parser else None
if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
tool_choice_function_name = request.tool_choice.function.name
else:
tool_choice_function_name = None
# Determine whether tools are in use with "auto" tool choice
tool_choice_auto = (
not tool_choice_function_name
and self._should_stream_with_auto_tool_parsing(request))
all_previous_token_ids: Optional[List[List[int]]]
if tool_choice_auto:
# These are only required in "auto" tool choice case
previous_texts = [""] * num_choices
all_previous_token_ids = [[]] * num_choices
else:
previous_texts, all_previous_token_ids = None, None
try:
async for res in result_generator:
+if res.prompt_token_ids is not None:
+    num_prompt_tokens = len(res.prompt_token_ids)

# We need to do it here, because if there are exceptions in
# the result_generator, it needs to be sent as the FIRST
# response (by the try...catch).
@@ -305,10 +326,10 @@ class OpenAIServingChat(OpenAIServing):
and request.stream_options.include_usage):
# if continuous usage stats are requested, add it
if request.stream_options.continuous_usage_stats:
-    prompt_tokens = len(res.prompt_token_ids)
-    usage = UsageInfo(prompt_tokens=prompt_tokens,
-                      completion_tokens=0,
-                      total_tokens=prompt_tokens)
+    usage = UsageInfo(
+        prompt_tokens=num_prompt_tokens,
+        completion_tokens=0,
+        total_tokens=num_prompt_tokens)
    chunk.usage = usage
# otherwise don't
else:
@@ -344,12 +365,10 @@ class OpenAIServingChat(OpenAIServing):
request.stream_options.include_usage):
if (request.stream_options.
continuous_usage_stats):
-prompt_tokens = len(
-    res.prompt_token_ids)
usage = UsageInfo(
-    prompt_tokens=prompt_tokens,
+    prompt_tokens=num_prompt_tokens,
    completion_tokens=0,
-    total_tokens=prompt_tokens)
+    total_tokens=num_prompt_tokens)
chunk.usage = usage
else:
chunk.usage = None
@@ -360,65 +379,66 @@ class OpenAIServingChat(OpenAIServing):
first_iteration = False

for output in res.outputs:
i = output.index

if finish_reason_sent[i]:
continue

-delta_token_ids = output.token_ids[previous_num_tokens[i]:]
-out_logprobs = output.logprobs[
-    previous_num_tokens[i]:] if output.logprobs else None

if request.logprobs and request.top_logprobs is not None:
-    assert out_logprobs is not None, (
+    assert output.logprobs is not None, (
        "Did not output logprobs")
    logprobs = self._create_chat_logprobs(
-        token_ids=delta_token_ids,
-        top_logprobs=out_logprobs,
+        token_ids=output.token_ids,
+        top_logprobs=output.logprobs,
        tokenizer=tokenizer,
        num_output_top_logprobs=request.top_logprobs,
    )
else:
logprobs = None

-delta_text = output.text[len(previous_texts[i]):]
-delta_message: Optional[DeltaMessage] = None
+delta_text = output.text
+delta_message: Optional[DeltaMessage]

# handle streaming deltas for tools with named tool_choice
-if (request.tool_choice and type(request.tool_choice) is
-        ChatCompletionNamedToolChoiceParam):
+if tool_choice_function_name:
    delta_message = DeltaMessage(tool_calls=[
        DeltaToolCall(function=DeltaFunctionCall(
-            name=request.tool_choice.function.name,
+            name=tool_choice_function_name,
            arguments=delta_text),
        index=i)
    ])

# handle streaming deltas for tools with "auto" tool choice
-elif (self._should_stream_with_auto_tool_parsing(request)
-      and tool_parser):
+elif tool_choice_auto:
+    assert previous_texts is not None
+    assert all_previous_token_ids is not None
+    assert tool_parser is not None
+    #TODO optimize manipulation of these lists
+    previous_text = previous_texts[i]
+    previous_token_ids = all_previous_token_ids[i]
+    current_text = previous_text + delta_text
+    current_token_ids = previous_token_ids + list(
+        output.token_ids)
+
    delta_message = (
        tool_parser.extract_tool_calls_streaming(
-            previous_text=previous_texts[i],
-            current_text=output.text,
+            previous_text=previous_text,
+            current_text=current_text,
            delta_text=delta_text,
-            previous_token_ids= \
-                output.token_ids[
-                    :-1 * len(delta_token_ids)
-                ],
-            current_token_ids=output.token_ids,
-            delta_token_ids=delta_token_ids
-        )
-    )
+            previous_token_ids=previous_token_ids,
+            current_token_ids=current_token_ids,
+            delta_token_ids=output.token_ids))
+
+    # update the previous values for the next iteration
+    previous_texts[i] = current_text
+    all_previous_token_ids[i] = current_token_ids

# handle streaming just a content delta
else:
delta_message = DeltaMessage(content=delta_text)

# set the previous values for the next iteration
-previous_texts[i] = output.text
-previous_num_tokens[i] = len(output.token_ids)
+previous_num_tokens[i] += len(output.token_ids)

# if the message delta is None (e.g. because it was a
# "control token" for tool calls or the parser otherwise
@@ -445,13 +465,12 @@ class OpenAIServingChat(OpenAIServing):
# handle usage stats if requested & if continuous
if (request.stream_options
and request.stream_options.include_usage):
-if (request.stream_options.continuous_usage_stats):
-    prompt_tokens = len(res.prompt_token_ids)
+if request.stream_options.continuous_usage_stats:
    completion_tokens = len(output.token_ids)
    usage = UsageInfo(
-        prompt_tokens=prompt_tokens,
+        prompt_tokens=num_prompt_tokens,
        completion_tokens=completion_tokens,
-        total_tokens=prompt_tokens +
+        total_tokens=num_prompt_tokens +
        completion_tokens,
    )
    chunk.usage = usage
@@ -482,7 +501,7 @@ class OpenAIServingChat(OpenAIServing):
tool_parser.prev_tool_call_arr[index].get(
"arguments", {}))

-# get what we've streamed so for for arguments
+# get what we've streamed so far for arguments
# for the current tool
actual_call = tool_parser.streamed_args_for_tool[
index]
@@ -500,7 +519,6 @@ class OpenAIServingChat(OpenAIServing):
])

# Send the finish response for each request.n only once
-prompt_tokens = len(res.prompt_token_ids)
choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=delta_message,
@@ -518,13 +536,12 @@ class OpenAIServingChat(OpenAIServing):
model=model_name)
if (request.stream_options
and request.stream_options.include_usage):
-if (request.stream_options.continuous_usage_stats):
-    prompt_tokens = len(res.prompt_token_ids)
+if request.stream_options.continuous_usage_stats:
    completion_tokens = len(output.token_ids)
    usage = UsageInfo(
-        prompt_tokens=prompt_tokens,
+        prompt_tokens=num_prompt_tokens,
        completion_tokens=completion_tokens,
-        total_tokens=prompt_tokens +
+        total_tokens=num_prompt_tokens +
        completion_tokens,
    )
    chunk.usage = usage
@@ -538,10 +555,11 @@ class OpenAIServingChat(OpenAIServing):
# is sent, send the usage
if (request.stream_options
and request.stream_options.include_usage):
+completion_tokens = previous_num_tokens[i]
final_usage = UsageInfo(
-    prompt_tokens=prompt_tokens,
-    completion_tokens=previous_num_tokens[i],
-    total_tokens=prompt_tokens + previous_num_tokens[i],
+    prompt_tokens=num_prompt_tokens,
+    completion_tokens=completion_tokens,
+    total_tokens=num_prompt_tokens + completion_tokens,
)

final_usage_chunk = ChatCompletionStreamResponse(
@@ -607,7 +625,7 @@ class OpenAIServingChat(OpenAIServing):
# if auto tools are not enabled, and a named tool choice using
# outlines is not being used
-if not (self.enable_auto_tools
+if (not self.enable_auto_tools
        or not self.tool_parser) and not isinstance(
            request.tool_choice,
            ChatCompletionNamedToolChoiceParam):
@@ -680,6 +698,7 @@ class OpenAIServingChat(OpenAIServing):
or "") or "")
choice.message.content = full_message choice.message.content = full_message
assert final_res.prompt_token_ids is not None
num_prompt_tokens = len(final_res.prompt_token_ids) num_prompt_tokens = len(final_res.prompt_token_ids)
num_generated_tokens = sum( num_generated_tokens = sum(
len(output.token_ids) for output in final_res.outputs) len(output.token_ids) for output in final_res.outputs)
@@ -789,9 +808,9 @@ class OpenAIServingChat(OpenAIServing):
return bool(
# if there is a delta message that includes tool calls which
# include a function that has arguments
-self.enable_auto_tools and self.tool_parser and delta_message
+output.finish_reason is not None
+and self.enable_auto_tools and self.tool_parser and delta_message
and delta_message.tool_calls and delta_message.tool_calls[0]
and delta_message.tool_calls[0].function
and delta_message.tool_calls[0].function.arguments is not None
-and output.finish_reason is not None
)
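Because outputs now arrive as deltas, the chat server must accumulate text and token ids itself for the "auto" tool-choice parser. The bookkeeping in isolation, as a runnable sketch with invented delta values:

previous_texts = [""]
all_previous_token_ids = [[]]
for delta_text, delta_ids in [("Hel", [1]), ("lo", [2, 3])]:
    i = 0  # choice index
    current_text = previous_texts[i] + delta_text
    current_token_ids = all_previous_token_ids[i] + delta_ids
    # update the previous values for the next iteration
    previous_texts[i] = current_text
    all_previous_token_ids[i] = current_token_ids
assert previous_texts[0] == "Hello"
assert all_previous_token_ids[0] == [1, 2, 3]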


@@ -223,9 +223,10 @@ class OpenAIServingCompletion(OpenAIServing):
tokenizer: AnyTokenizer,
) -> AsyncGenerator[str, None]:
num_choices = 1 if request.n is None else request.n
-previous_texts = [""] * num_choices * num_prompts
+previous_text_lens = [0] * num_choices * num_prompts
previous_num_tokens = [0] * num_choices * num_prompts
has_echoed = [False] * num_choices * num_prompts
+num_prompt_tokens = [0] * num_prompts

try:
async for prompt_idx, res in result_generator:
@@ -233,6 +234,10 @@ class OpenAIServingCompletion(OpenAIServing):
                 prompt_logprobs = res.prompt_logprobs
                 prompt_text = res.prompt

+                # Prompt details are excluded from later streamed outputs
+                if res.prompt_token_ids is not None:
+                    num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
+
                 delta_token_ids: GenericSequence[int]
                 out_logprobs: Optional[GenericSequence[Optional[Dict[
                     int, Logprob]]]]
@@ -244,6 +249,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     assert request.max_tokens is not None
                     if request.echo and request.max_tokens == 0:
+                        assert prompt_token_ids is not None
                         assert prompt_text is not None
                         # only return the prompt
                         delta_text = prompt_text
@@ -252,6 +258,7 @@ class OpenAIServingCompletion(OpenAIServing):
                         has_echoed[i] = True
                     elif (request.echo and request.max_tokens > 0
                           and not has_echoed[i]):
+                        assert prompt_token_ids is not None
                         assert prompt_text is not None
                         assert prompt_logprobs is not None
                         # echo the prompt and first token
@@ -266,11 +273,9 @@ class OpenAIServingCompletion(OpenAIServing):
                         has_echoed[i] = True
                     else:
                         # return just the delta
-                        delta_text = output.text[len(previous_texts[i]):]
-                        delta_token_ids = output.token_ids[
-                            previous_num_tokens[i]:]
-                        out_logprobs = output.logprobs[previous_num_tokens[
-                            i]:] if output.logprobs else None
+                        delta_text = output.text
+                        delta_token_ids = output.token_ids
+                        out_logprobs = output.logprobs

                     if request.logprobs is not None:
                         assert out_logprobs is not None, (
@@ -280,13 +285,13 @@ class OpenAIServingCompletion(OpenAIServing):
                             top_logprobs=out_logprobs,
                             num_output_top_logprobs=request.logprobs,
                             tokenizer=tokenizer,
-                            initial_text_offset=len(previous_texts[i]),
+                            initial_text_offset=previous_text_lens[i],
                         )
                     else:
                         logprobs = None

-                    previous_texts[i] = output.text
-                    previous_num_tokens[i] = len(output.token_ids)
+                    previous_text_lens[i] += len(output.text)
+                    previous_num_tokens[i] += len(output.token_ids)
                     finish_reason = output.finish_reason
                     stop_reason = output.stop_reason
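
Together with the earlier switch from previous_texts to previous_text_lens, these hunks assume that output.text and output.token_ids now arrive as per-step deltas rather than cumulative values, so only running lengths are needed for logprob text offsets, not the accumulated strings. An illustrative sketch:

    # Sketch under that delta assumption; list size and helper name are
    # illustrative, not part of the serving code.
    previous_text_lens = [0, 0]      # one slot per choice
    previous_num_tokens = [0, 0]


    def on_delta(i: int, delta_text: str, delta_token_ids: list) -> int:
        initial_text_offset = previous_text_lens[i]   # where this delta starts
        previous_text_lens[i] += len(delta_text)
        previous_num_tokens[i] += len(delta_token_ids)
        return initial_text_offset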
@@ -307,8 +312,8 @@ class OpenAIServingCompletion(OpenAIServing):
                             and request.stream_options.include_usage):
                         if (request.stream_options.continuous_usage_stats
                                 or output.finish_reason is not None):
-                            prompt_tokens = len(prompt_token_ids)
-                            completion_tokens = len(output.token_ids)
+                            prompt_tokens = num_prompt_tokens[prompt_idx]
+                            completion_tokens = previous_num_tokens[i]
                             usage = UsageInfo(
                                 prompt_tokens=prompt_tokens,
                                 completion_tokens=completion_tokens,
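
A completion request may batch several prompts, so prompt sizes are keyed by prompt_idx and recorded from the first response that still carries prompt_token_ids; usage chunks then read these accumulators instead of the possibly-absent per-output fields. A small sketch of that bookkeeping:

    # Illustrative only: `record_prompt_size` is a made-up helper name.
    from typing import List, Optional

    num_prompts = 3                       # illustrative batch size
    num_prompt_tokens = [0] * num_prompts


    def record_prompt_size(prompt_idx: int,
                           prompt_token_ids: Optional[List[int]]) -> None:
        # Later streamed outputs for this prompt may omit prompt details.
        if prompt_token_ids is not None:
            num_prompt_tokens[prompt_idx] = len(prompt_token_ids)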
@@ -356,6 +361,7 @@ class OpenAIServingCompletion(OpenAIServing):
         for final_res in final_res_batch:
             prompt_token_ids = final_res.prompt_token_ids
+            assert prompt_token_ids is not None
             prompt_logprobs = final_res.prompt_logprobs
             prompt_text = final_res.prompt
@@ -411,9 +417,9 @@ class OpenAIServingCompletion(OpenAIServing):
                 )
                 choices.append(choice_data)
-                num_generated_tokens += len(output.token_ids)

             num_prompt_tokens += len(prompt_token_ids)
+            num_generated_tokens += sum(
+                len(output.token_ids) for output in final_res.outputs)

         usage = UsageInfo(
             prompt_tokens=num_prompt_tokens,

View File

@@ -33,7 +33,6 @@ class Hermes2ProToolParser(ToolParser):
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: List[Dict] = []
         self.current_tool_id: int = -1
-        self.current_tool_name_sent = False
         self.streamed_args_for_tool: List[str] = [
         ]  # map what has been streamed for each tool so far to a list

View File

@@ -58,7 +58,6 @@ if TYPE_CHECKING:
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_TEST_FORCE_FP8_MARLIN: bool = False
     VLLM_RPC_GET_DATA_TIMEOUT_MS: int = 5000
-    VLLM_ALLOW_ENGINE_USE_RAY: bool = False
     VLLM_PLUGINS: Optional[List[str]] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
@@ -391,14 +390,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_RPC_GET_DATA_TIMEOUT_MS":
     lambda: int(os.getenv("VLLM_RPC_GET_DATA_TIMEOUT_MS", "5000")),

-    # If set, allow running the engine as a separate ray actor,
-    # which is a deprecated feature soon to be removed.
-    # See https://github.com/vllm-project/vllm/issues/7045
-    "VLLM_ALLOW_ENGINE_USE_RAY":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
-     ("1", "true")),

     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
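
For context on the removal: entries in environment_variables are zero-argument lambdas, resolved lazily through a module-level __getattr__ (PEP 562), so deleting an entry fully retires the variable. An illustrative sketch of the pattern, with made-up variable names rather than real vLLM ones:

    import os
    from typing import Any, Callable, Dict

    environment_variables: Dict[str, Callable[[], Any]] = {
        # Each value is read from the process environment at access time.
        "EXAMPLE_TIMEOUT_MS":
        lambda: int(os.getenv("EXAMPLE_TIMEOUT_MS", "5000")),
        "EXAMPLE_FLAG":
        lambda: os.environ.get("EXAMPLE_FLAG", "0").strip().lower() in
        ("1", "true"),
    }


    def __getattr__(name: str):
        # Module-level __getattr__ makes `envs.EXAMPLE_FLAG` evaluate lazily.
        if name in environment_variables:
            return environment_variables[name]()
        raise AttributeError(f"module has no attribute {name!r}")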

View File

@@ -5,7 +5,8 @@ from typing_extensions import TypeIs
 from vllm.utils import is_list_of

 from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
-                   LLMInputs, PromptInputs)
+                   LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt,
+                   TokensPrompt)


 class ParsedText(TypedDict):
@@ -60,8 +61,38 @@ def parse_and_batch_prompt(
             for elem in prompt
         ]

-    raise ValueError("prompt must be a string, array of strings, "
-                     "array of tokens, or array of token arrays")
+    raise TypeError("prompt must be a string, array of strings, "
+                    "array of tokens, or array of token arrays")
+
+
+class ParsedStrPrompt(TypedDict):
+    type: Literal["str"]
+    content: str
+
+
+class ParsedTextPrompt(TypedDict):
+    type: Literal["text"]
+    content: TextPrompt
+
+
+class ParsedTokensPrompt(TypedDict):
+    type: Literal["tokens"]
+    content: TokensPrompt
+
+
+def parse_singleton_prompt(
+    inputs: SingletonPromptInputs,
+) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
+    if isinstance(inputs, str):
+        return ParsedStrPrompt(type="str", content=inputs)
+    elif isinstance(inputs, dict):
+        if "prompt_token_ids" in inputs:
+            return ParsedTokensPrompt(type="tokens",
+                                      content=inputs)  # type: ignore
+        elif "prompt" in inputs:
+            return ParsedTextPrompt(type="text", content=inputs)
+
+    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")


 def is_explicit_encoder_decoder_prompt(
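
A usage sketch for the new parse_singleton_prompt helper above (import path assumed from the file living under vllm/inputs/): each accepted input shape maps to one Parsed* TypedDict, with prompt_token_ids checked before prompt.

    from vllm.inputs.parse import parse_singleton_prompt

    assert parse_singleton_prompt("Hi")["type"] == "str"
    assert parse_singleton_prompt({"prompt": "Hi"})["type"] == "text"
    assert parse_singleton_prompt({"prompt_token_ids": [1, 2]})["type"] == "tokens"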

vllm/inputs/preprocess.py  (new file, 536 lines)
View File

@@ -0,0 +1,536 @@
import asyncio
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
from typing_extensions import assert_never
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
SingletonPromptInputs)
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
if TYPE_CHECKING:
from vllm.multimodal import MultiModalDataDict
logger = init_logger(__name__)
PromptComponents = Tuple[Optional[str], List[int],
Optional["MultiModalDataDict"]]
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
Optional["MultiModalDataDict"]]
class InputPreprocessor:
def __init__(
self,
model_config: ModelConfig,
tokenizer: Optional[BaseTokenizerGroup],
) -> None:
super().__init__()
self.model_config = model_config
self.tokenizer = tokenizer
def get_tokenizer_group(self) -> BaseTokenizerGroup:
if self.tokenizer is None:
raise ValueError("You cannot pass text prompts when "
"`skip_tokenizer_init` is True")
return self.tokenizer
def get_bos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for BOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
def get_eos_token_id(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
if self.tokenizer is None:
logger.warning("Using None for EOS token id because tokenizer "
"is not initialized")
return None
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
def get_decoder_start_token_id(self) -> Optional[int]:
'''
Obtain the decoder start token id employed by an encoder/decoder
model. Returns None for non-encoder/decoder models or if the
model config is unavailable.
'''
if not self.is_encoder_decoder_model():
logger.warning("Using None for decoder start token id because "
"this is not an encoder/decoder model.")
return None
if (self.model_config is None or self.model_config.hf_config is None):
logger.warning("Using None for decoder start token id because "
"model config is not available.")
return None
dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None)
if dec_start_token_id is None:
logger.warning("Falling back on <BOS> for decoder start token id "
"because decoder start token id is not available.")
dec_start_token_id = self.get_bos_token_id()
return dec_start_token_id
def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
'''
Specifically for encoder/decoder models:
generate a default decoder prompt for when
the user specifies only the encoder prompt.
Encoder/decoder models utilize the decoder
prompt in different ways; as new models are
added, it is intended that this function
will be extended to produce differing
default decoder prompts, depending on the
model variety.
Absent a special case, the default behavior
of this method is to mirror the behavior of
the HuggingFace (HF) GenerationMixin for a None
decoder prompt, which is to employ a logit processor
setting to force the first decoded token to be <BOS>.
Here, this behavior is approximated by having the
"default" decoder prompt be <BOS>.
However, it is possible that in the future
other models may have different or more
complex logic for the default decoder prompt.
This motivates having a special helper method
for default decoder prompts.
Returns:
* prompt_token_ids
'''
bos_token_id = self.get_bos_token_id()
assert bos_token_id is not None
return [bos_token_id]
def _prepare_decoder_input_ids_for_generation(
self,
decoder_input_ids: Optional[List[int]],
) -> List[int]:
"""
Prepares `decoder_input_ids` for generation with encoder-decoder models.
Based on
https://github.com/huggingface/transformers/blob/
4037a2b5b1278736e566aec12e169100275545ea/
src/transformers/generation/utils.py
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
Arguments:
* decoder_input_ids: input token ids to preprocess
Returns:
* Processed token list
"""
decoder_start_token_id = self.get_decoder_start_token_id()
assert decoder_start_token_id is not None
if decoder_input_ids is None:
# no decoder prompt input ->
# use decoder_start_token_id as decoder_input_ids
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
if (len(decoder_input_ids) == 0
or decoder_input_ids[0] != decoder_start_token_id):
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
return decoder_input_ids
def _apply_prompt_adapter(
self,
prompt_token_ids: List[int],
prompt_adapter_request: Optional[PromptAdapterRequest],
) -> List[int]:
if prompt_adapter_request:
prompt_token_ids = (
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
+ prompt_token_ids)
return prompt_token_ids
def _tokenize_prompt(
self,
prompt: str,
request_id: str,
lora_request: Optional[LoRARequest],
) -> List[int]:
"""
Apply the model's tokenizer to a text prompt, returning the
corresponding token IDs.
"""
tokenizer = self.get_tokenizer_group()
return tokenizer.encode(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
async def _tokenize_prompt_async(
self,
prompt: str,
request_id: str,
lora_request: Optional[LoRARequest],
) -> List[int]:
"""Async version of :meth:`_tokenize_prompt`."""
tokenizer = self.get_tokenizer_group()
return await tokenizer.encode_async(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
def _extract_prompt_components(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> PromptComponents:
'''
Extract the components of any single encoder or decoder input prompt.
Arguments:
* request_id
* inputs: single encoder or decoder input prompt
* lora_request: this is only valid for decoder prompts
Returns:
* prompt
* prompt_token_ids
* multi_modal_data
'''
parsed = parse_singleton_prompt(inputs)
if parsed["type"] == "str":
prompt = parsed["content"]
prompt_token_ids = self._tokenize_prompt(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = None
elif parsed["type"] == "tokens":
prompt = None
prompt_token_ids = parsed["content"]["prompt_token_ids"]
multi_modal_data = parsed["content"].get("multi_modal_data")
elif parsed["type"] == "text":
prompt = parsed["content"]["prompt"]
prompt_token_ids = self._tokenize_prompt(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = parsed["content"].get("multi_modal_data")
else:
assert_never(parsed)
return prompt, prompt_token_ids, multi_modal_data
async def _extract_prompt_components_async(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
) -> PromptComponents:
"""Async version of :meth:`_extract_prompt_components`."""
parsed = parse_singleton_prompt(inputs)
if parsed["type"] == "str":
prompt = parsed["content"]
prompt_token_ids = await self._tokenize_prompt_async(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = None
elif parsed["type"] == "tokens":
prompt = None
prompt_token_ids = parsed["content"]["prompt_token_ids"]
multi_modal_data = parsed["content"].get("multi_modal_data")
elif parsed["type"] == "text":
prompt = parsed["content"]["prompt"]
prompt_token_ids = await self._tokenize_prompt_async(
prompt,
request_id=request_id,
lora_request=lora_request,
)
multi_modal_data = parsed["content"].get("multi_modal_data")
else:
assert_never(parsed)
return prompt, prompt_token_ids, multi_modal_data
def _build_enc_dec_llm_inputs(
self,
encoder_comps: PromptComponents,
decoder_comps: DecoderPromptComponents,
) -> EncoderDecoderLLMInputs:
encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
if encoder_mm_data is not None or decoder_mm_data is not None:
raise ValueError("Multi-modal encoder-decoder models are "
"not supported yet")
decoder_prompt_ids = (
self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
return EncoderDecoderLLMInputs(
prompt_token_ids=decoder_prompt_ids,
prompt=decoder_prompt,
encoder_prompt_token_ids=encoder_prompt_ids,
encoder_prompt=encoder_prompt,
)
def _process_encoder_decoder_prompt(
self,
inputs: PromptInputs,
request_id: str,
) -> EncoderDecoderLLMInputs:
'''
For encoder/decoder models only:
Process an input prompt into an
:class:`EncoderDecoderLLMInputs` instance.
There are two types of input prompts:
singleton prompts which carry only the
encoder prompt, and explicit encoder/decoder
prompts which carry both the encoder and the
decoder prompts as member variables.
This function handles the following scenarios:
* Singleton encoder prompt: extract encoder prompt
token ids & infer default decoder prompt token ids
* Explicit encoder/decoder prompt: extract encoder
and decoder prompt token ids
Note that for Explicit encoder/decoder prompts,
each sub-prompt (encoder or decoder prompt) can
have any possible singleton type; thus this
method relies on helper functions to obtain
token ids for the sub-prompts.
Arguments:
* inputs: an input prompt
* request_id
Returns:
* :class:`EncoderDecoderLLMInputs` instance
'''
encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents
if is_explicit_encoder_decoder_prompt(inputs):
encoder_comps = self._extract_prompt_components(
inputs["encoder_prompt"],
request_id=request_id,
)
if (decoder_input := inputs["decoder_prompt"]) is None:
decoder_comps = None, None, None
else:
decoder_comps = self._extract_prompt_components(
decoder_input,
request_id=request_id,
)
else:
encoder_comps = self._extract_prompt_components(
inputs,
request_id=request_id,
)
decoder_comps = None, None, None
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
async def _process_encoder_decoder_prompt_async(
self,
inputs: PromptInputs,
request_id: str,
) -> EncoderDecoderLLMInputs:
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
encoder_comps: PromptComponents
decoder_comps: DecoderPromptComponents
if is_explicit_encoder_decoder_prompt(inputs):
encoder_task = self._extract_prompt_components_async(
inputs["encoder_prompt"],
request_id=request_id,
)
if (decoder_input := inputs["decoder_prompt"]) is None:
encoder_comps = await encoder_task
decoder_comps = None, None, None
else:
decoder_task = self._extract_prompt_components_async(
decoder_input,
request_id=request_id,
)
encoder_comps, decoder_comps = await asyncio.gather(
encoder_task, decoder_task)
else:
encoder_comps = await self._extract_prompt_components_async(
inputs,
request_id=request_id,
)
decoder_comps = None, None, None
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
def _build_decoder_only_llm_inputs(
self,
prompt_comps: PromptComponents,
prompt_adapter_request: Optional[PromptAdapterRequest],
) -> LLMInputs:
prompt, prompt_token_ids, multi_modal_data = prompt_comps
prompt_token_ids = self._apply_prompt_adapter(
prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
return LLMInputs(prompt_token_ids=prompt_token_ids,
prompt=prompt,
multi_modal_data=multi_modal_data)
def _process_decoder_only_prompt(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> LLMInputs:
'''
For decoder-only models:
Process an input prompt into an :class:`LLMInputs` instance.
Arguments:
* inputs: input prompt
* request_id
* lora_request
* prompt_adapter_request
Returns:
* :class:`LLMInputs` instance
'''
prompt_comps = self._extract_prompt_components(
inputs,
request_id=request_id,
lora_request=lora_request,
)
return self._build_decoder_only_llm_inputs(
prompt_comps,
prompt_adapter_request=prompt_adapter_request,
)
async def _process_decoder_only_prompt_async(
self,
inputs: SingletonPromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> LLMInputs:
"""Async version of :meth:`_process_decoder_only_prompt`."""
prompt_comps = await self._extract_prompt_components_async(
inputs,
request_id=request_id,
lora_request=lora_request,
)
return self._build_decoder_only_llm_inputs(
prompt_comps,
prompt_adapter_request=prompt_adapter_request,
)
def preprocess(
self,
inputs: PromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
"""Preprocess the input prompt."""
if self.is_encoder_decoder_model():
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
return self._process_encoder_decoder_prompt(
inputs,
request_id=request_id,
)
if is_explicit_encoder_decoder_prompt(inputs):
raise ValueError("Cannot pass encoder-decoder prompt "
"to decoder-only models")
# Decoder-only operation
return self._process_decoder_only_prompt(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
async def preprocess_async(
self,
inputs: PromptInputs,
request_id: str,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
"""Async version of :meth:`preprocess`."""
if self.is_encoder_decoder_model():
# Encoder-decoder model requires special mapping of
# input prompts to encoder & decoder
return await self._process_encoder_decoder_prompt_async(
inputs,
request_id=request_id,
)
if is_explicit_encoder_decoder_prompt(inputs):
raise ValueError("Cannot pass encoder-decoder prompt "
"to decoder-only models")
# Decoder-only operation
return await self._process_decoder_only_prompt_async(
inputs,
request_id=request_id,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)
def is_encoder_decoder_model(self):
return self.model_config.is_encoder_decoder_model
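
A hedged usage sketch for the new InputPreprocessor: model_config and tokenizer_group below are placeholders for the real ModelConfig and BaseTokenizerGroup instances an engine already owns; the call pattern is what the class provides.

    from vllm.inputs.preprocess import InputPreprocessor

    preprocessor = InputPreprocessor(model_config, tokenizer_group)  # placeholders
    llm_inputs = preprocessor.preprocess("Hello, world!", request_id="req-0")
    # Decoder-only models get LLMInputs with tokenized prompt_token_ids;
    # encoder/decoder models get EncoderDecoderLLMInputs carrying both prompts.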

View File

@@ -410,6 +410,7 @@ def fused_topk(
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
     return topk_weights, topk_ids

@@ -443,7 +444,8 @@ def grouped_topk(hidden_states: torch.Tensor,
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

-    return topk_weights, topk_ids
+    return topk_weights, topk_ids.to(torch.int32)

View File

@@ -990,7 +990,7 @@ def get_rope(
             base, is_neox_style, dtype, short_factor, long_factor,
             **extra_kwargs)
     elif scaling_type == "mrope":
-        return MRotaryEmbedding(
+        rotary_emb = MRotaryEmbedding(
             head_size,
             rotary_dim,
             max_position,
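
A plausible reading of this one-line change: get_rope ends with a shared cache-and-return, so assigning into rotary_emb instead of returning early lets the mrope branch be cached like the other scaling types. A generic sketch of that memoization pattern (names are illustrative):

    _ROPE_DICT: dict = {}

    def get_rope_cached(key, factory):
        if key in _ROPE_DICT:
            return _ROPE_DICT[key]
        rotary_emb = factory()          # e.g. construct the embedding here
        _ROPE_DICT[key] = rotary_emb    # an early `return` would skip this
        return rotary_emb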

View File

@@ -90,12 +90,12 @@ _MULTIMODAL_MODELS = {
     "PaliGemmaForConditionalGeneration": ("paligemma",
                                           "PaliGemmaForConditionalGeneration"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "PixtralForConditionalGeneration": ("pixtral",
                                         "PixtralForConditionalGeneration"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                         "Qwen2VLForConditionalGeneration"),
+    "UltravoxModel": ("ultravox", "UltravoxModel"),
 }

 _CONDITIONAL_GENERATION_MODELS = {
     "BartModel": ("bart", "BartForConditionalGeneration"),
