Compare commits
41 commits: v0.6.1 ... v0.6.1.post1

SHA1: 9ba0817ff1, 18e9e1f7b3, f57092c00b, a84e598e21, 0a4806f0a9, ecd7a1d5b6, a2469127db, 06311e2956, cab69a15e4, 9b4a3b235e, acda0b35d0, ba77527955, 6821020109, 8427550488, 3f79bc3d1a, 40c396533d, 5ec9c0fb3c, 8f44a92d85, 360ddbd37e, a480939e8e, d31174a4e1, b61bd98f90, c16369455f, 019877253b, 551ce01078, a6c0f3658d, f2e263b801, 1f0c75afa9, 8a23e93302, c6202daeed, e56bf27741, 520ca380ae, 7de49aa86c, 42ffba11ad, 295c4730a8, 1bf2dd9df0, 5a60699c45, b6c75e1cf2, b71c956deb, f842a7aff1, a65cb16067
@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
    --ignore=tests/models/test_oot_registration.py \
    --ignore=tests/models/test_registry.py \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/test_jamba.py \
    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
  pytest -v -s tests/models/decoder_only/language \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/decoder_only/language/test_jamba.py \
    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported

# Run compressed-tensor test
docker exec cpu-test bash -c "
@@ -50,6 +50,7 @@ steps:
|
||||
- tests/worker
|
||||
commands:
|
||||
- pytest -v -s async_engine # Async Engine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
@@ -91,7 +92,7 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
@@ -162,15 +163,6 @@ steps:
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference_encoder_decoder.py
|
||||
|
||||
- label: Models Test # 1hr10min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py
|
||||
|
||||
- label: torch compile integration test
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@@ -178,14 +170,6 @@ steps:
|
||||
- pytest -v -s ./compile/test_full_graph.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
|
||||
|
||||
- label: Vision Language Models Test # 42min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
commands:
|
||||
- pytest -v -s models -m vlm
|
||||
|
||||
- label: Prefix Caching Test # 7min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
@@ -284,6 +268,45 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 3min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py
|
||||
|
||||
- label: Decoder-only Language Models Test # 1h3min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/language
|
||||
|
||||
- label: Decoder-only Multi-Modal Models Test # 56min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
commands:
|
||||
- pytest -v -s models/decoder_only/audio_language
|
||||
- pytest -v -s models/decoder_only/vision_language
|
||||
|
||||
- label: Other Models Test # 5min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
- pytest -v -s models/embedding/language
|
||||
- pytest -v -s models/encoder_decoder/language
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@@ -309,11 +332,11 @@ steps:
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 28min
|
||||
#mirror_hardwares: [amd]
|
||||
@@ -326,11 +349,10 @@ steps:
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
commands:
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
|
||||
- pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
|
||||
.github/ISSUE_TEMPLATE/400-bug report.yml (vendored, 9 changes)

@@ -30,6 +30,15 @@ body:
      </details>
  validations:
    required: true
- type: textarea
  attributes:
    label: Model Input Dumps
    description: |
      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
    placeholder: |
      Upload the dumped input file.
  validations:
    required: false
- type: textarea
  attributes:
    label: 🐛 Describe the bug
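The dump described in the new "Model Input Dumps" field is a plain Python pickle. A minimal, hypothetical sketch of inspecting such a file locally before zipping and attaching it to a report (the path is illustrative; use the one printed in the error message):

```python
import pickle
import zipfile

# Illustrative path; vLLM prints the real one in
# "Error in model execution (input dumped to /tmp/err_xxx.pkl)".
dump_path = "/tmp/err_example.pkl"

with open(dump_path, "rb") as f:
    dumped = pickle.load(f)  # typically a dict of the model's forward() arguments

print(type(dumped))
if isinstance(dumped, dict):
    print(list(dumped.keys()))

# GitHub does not accept .pkl attachments, so wrap the file in a zip first.
with zipfile.ZipFile(dump_path + ".zip", "w") as zf:
    zf.write(dump_path)
```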
csrc/ops.h (19 changes)

@@ -54,10 +54,21 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

void gelu_quick(torch::Tensor& out, torch::Tensor& input);

void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
                  torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
                  torch::Tensor& input_positions, torch::Tensor& seq_lens,
                  torch::Tensor& slot_mapping, torch::Tensor& block_tables);
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
                            int64_t block_size, torch::Tensor& input_tokens,
                            torch::Tensor& sampled_token_ids,
                            torch::Tensor& input_positions,
                            torch::Tensor& seq_lens,
                            torch::Tensor& slot_mapping,
                            torch::Tensor& block_tables);

void advance_step_flashinfer(
    int64_t num_seqs, int64_t num_queries, int64_t block_size,
    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
    torch::Tensor& input_positions, torch::Tensor& seq_lens,
    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
@@ -12,13 +12,11 @@ namespace prepare_inputs {
|
||||
|
||||
//
|
||||
template <int const num_threads>
|
||||
__global__ void advance_step_kernel(int num_seqs, int num_queries,
|
||||
int block_size, long* input_tokens_ptr,
|
||||
long const* sampled_token_ids_ptr,
|
||||
long* input_positions_ptr,
|
||||
int* seq_lens_ptr, long* slot_mapping_ptr,
|
||||
int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride) {
|
||||
__global__ void advance_step_flashattn_kernel(
|
||||
int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
|
||||
long const* sampled_token_ids_ptr, long* input_positions_ptr,
|
||||
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride) {
|
||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||
|
||||
if (blockIdx.x >= num_query_blocks) {
|
||||
@@ -79,16 +77,91 @@ inline void verify_tensor(std::string const& name, torch::Tensor& t,
|
||||
}
|
||||
}
|
||||
|
||||
void advance_step(int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables) { // type: int
|
||||
__global__ void advance_step_flashinfer_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int block_size,
|
||||
long* input_tokens_ptr, long const* sampled_token_ids_ptr,
|
||||
long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
|
||||
int const* block_tables_ptr, int64_t const block_tables_stride,
|
||||
int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
|
||||
int num_query_blocks = div_ceil(num_queries, num_threads);
|
||||
|
||||
if (blockIdx.x < num_query_blocks) {
|
||||
int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
|
||||
|
||||
if (cur_query_id < num_queries) {
|
||||
// Update input_tokens
|
||||
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
|
||||
|
||||
int seq_len = seq_lens_ptr[cur_query_id];
|
||||
int next_seq_len = seq_len + 1;
|
||||
int next_input_pos = next_seq_len - 1;
|
||||
|
||||
// Update seq_lens
|
||||
seq_lens_ptr[cur_query_id] = next_seq_len;
|
||||
// Update input_positions
|
||||
input_positions_ptr[cur_query_id] = next_input_pos;
|
||||
|
||||
int const* seq_block_tables_ptr =
|
||||
block_tables_ptr + block_tables_stride * cur_query_id;
|
||||
|
||||
int block_index = next_input_pos / block_size;
|
||||
int block_offset = next_input_pos % block_size;
|
||||
|
||||
// Update paged_kv_last_page_len
|
||||
paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
|
||||
|
||||
int slot_num =
|
||||
seq_block_tables_ptr[block_index] * block_size + block_offset;
|
||||
// Update slot_mapping
|
||||
slot_mapping_ptr[cur_query_id] = slot_num;
|
||||
block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void advance_step_flashinfer_indptr_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
|
||||
int* block_table_bound_ptr) {
|
||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
||||
|
||||
// Update paged_kv_indptr
|
||||
if (idx < num_queries) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i <= idx; ++i) {
|
||||
sum += block_table_bound_ptr[i];
|
||||
}
|
||||
paged_kv_indptr_ptr[idx + 1] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void advance_step_flashinfer_indices_kernel(
|
||||
int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr,
|
||||
int64_t const block_tables_stride, int* paged_kv_indices_ptr,
|
||||
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
|
||||
int idx = blockIdx.x * num_threads + threadIdx.x;
|
||||
int row = idx / block_tables_stride;
|
||||
int col = idx % block_tables_stride;
|
||||
|
||||
if (row < num_queries && col < block_table_bound_ptr[row]) {
|
||||
paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] =
|
||||
block_tables_ptr[row * block_tables_stride + col];
|
||||
}
|
||||
// if cudagraph, fill padded seqs with the last valid seq's indptr
|
||||
if (num_queries < row && row <= num_seqs) {
|
||||
paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries];
|
||||
}
|
||||
}
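The two `advance_step_flashinfer_*` kernels above build the CSR-style `paged_kv_indptr` / `paged_kv_indices` arrays that FlashInfer's paged KV cache consumes. A host-side NumPy sketch of the same computation, for illustration only (input values are made up; names mirror the kernel arguments):

```python
import numpy as np

# Illustrative inputs: 3 queries, block tables padded to stride 4.
block_tables = np.array([[7, 2, 0, 0],
                         [5, 9, 3, 0],
                         [1, 0, 0, 0]], dtype=np.int32)
next_seq_lens = np.array([33, 45, 10], dtype=np.int32)
block_size = 16

# block_table_bound[i] = ceil(next_seq_len / block_size), as written per query
# by advance_step_flashinfer_kernel.
block_table_bound = -(-next_seq_lens // block_size)

# paged_kv_indptr is the prefix sum over block_table_bound
# (what advance_step_flashinfer_indptr_kernel computes).
paged_kv_indptr = np.zeros(len(next_seq_lens) + 1, dtype=np.int32)
paged_kv_indptr[1:] = np.cumsum(block_table_bound)

# paged_kv_indices packs each query's valid block ids back to back
# (what advance_step_flashinfer_indices_kernel writes).
paged_kv_indices = np.concatenate(
    [row[:bound] for row, bound in zip(block_tables, block_table_bound)])

print(block_table_bound)   # [3 3 1]
print(paged_kv_indptr)     # [0 3 6 7]
print(paged_kv_indices)    # [7 2 0 5 9 3 1]
```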
|
||||
|
||||
void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables) { // type: int
|
||||
|
||||
if (logging) {
|
||||
printf("advance_step:\n");
|
||||
printf("advance_step_flashattn:\n");
|
||||
printf(" num_seqs = %d\n", num_seqs);
|
||||
printf(" num_queries = %d\n", num_queries);
|
||||
printf(" block_size = %d\n", block_size);
|
||||
@@ -108,24 +181,126 @@ void advance_step(int num_seqs, int num_queries, int block_size,
|
||||
int blocks;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
|
||||
advance_step_kernel<max_threads><<<blocks, max_threads, 0, stream>>>(
|
||||
num_seqs, num_queries, block_size,
|
||||
advance_step_flashattn_kernel<max_threads>
|
||||
<<<blocks, max_threads, 0, stream>>>(
|
||||
num_seqs, num_queries, block_size,
|
||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||
reinterpret_cast<long*>(input_positions.data_ptr()),
|
||||
reinterpret_cast<int*>(seq_lens.data_ptr()),
|
||||
reinterpret_cast<long*>(slot_mapping.data_ptr()),
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0));
|
||||
}
|
||||
|
||||
void advance_step_flashinfer(
|
||||
int num_seqs, int num_queries, int block_size,
|
||||
torch::Tensor& input_tokens, // type: long
|
||||
torch::Tensor& sampled_token_ids, // type: long
|
||||
torch::Tensor& input_positions, // type: long
|
||||
torch::Tensor& seq_lens, // type: int
|
||||
torch::Tensor& slot_mapping, // type: long
|
||||
torch::Tensor& block_tables, // type: int
|
||||
torch::Tensor& paged_kv_indices, // type: int
|
||||
torch::Tensor& paged_kv_indptr, // type: int
|
||||
torch::Tensor& paged_kv_last_page_len, // type: int
|
||||
torch::Tensor& block_table_bound) { // type: int
|
||||
|
||||
if (logging) {
|
||||
printf("advance_step_flashinfer:\n");
|
||||
printf(" num_seqs = %d\n", num_seqs);
|
||||
printf(" num_queries = %d\n", num_queries);
|
||||
printf(" block_size = %d\n", block_size);
|
||||
printf(" block_tables.stride(0) = %d\n", block_tables.stride(0));
|
||||
}
|
||||
// Verify all tensors
|
||||
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
|
||||
// verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
|
||||
// at::kLong);
|
||||
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
|
||||
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
|
||||
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
|
||||
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
|
||||
|
||||
verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
|
||||
verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
|
||||
verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
|
||||
at::kInt);
|
||||
|
||||
verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
|
||||
|
||||
int dev = sampled_token_ids.get_device();
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
|
||||
|
||||
int blocks;
|
||||
int threads;
|
||||
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
|
||||
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
|
||||
if (logging) {
|
||||
printf("launching kernel with %d blocks\n", blocks);
|
||||
}
|
||||
|
||||
// TODO(will): support arbitrary block_tables stride
|
||||
if ((blocks * threads) / block_tables.stride(0) < num_queries) {
|
||||
TORCH_CHECK(false,
|
||||
"multi-step: not enough threads to map block_table to"
|
||||
"FlashInfer's paged_kv_indices on GPU. Try reducing the number "
|
||||
"of seqs,",
|
||||
" increasing the block size or take smaller steps.",
|
||||
" num_queries = ", num_queries,
|
||||
" block_tables.stride(0) = ", block_tables.stride(0),
|
||||
" blocks = ", blocks, " max_threads = ", threads);
|
||||
}
|
||||
|
||||
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries, block_size,
|
||||
reinterpret_cast<long*>(input_tokens.data_ptr()),
|
||||
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
|
||||
reinterpret_cast<long*>(input_positions.data_ptr()),
|
||||
reinterpret_cast<int*>(seq_lens.data_ptr()),
|
||||
reinterpret_cast<long*>(slot_mapping.data_ptr()),
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0));
|
||||
block_tables.stride(0),
|
||||
reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
|
||||
advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries,
|
||||
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
|
||||
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
|
||||
threads, num_seqs, num_queries,
|
||||
reinterpret_cast<int const*>(block_tables.data_ptr()),
|
||||
block_tables.stride(0),
|
||||
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
|
||||
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
|
||||
reinterpret_cast<int*>(block_table_bound.data_ptr()));
|
||||
}
|
||||
|
||||
} // namespace prepare_inputs
|
||||
|
||||
void advance_step(int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables) {
|
||||
prepare_inputs::advance_step(num_seqs, num_queries, block_size, input_tokens,
|
||||
sampled_token_ids, input_positions, seq_lens,
|
||||
slot_mapping, block_tables);
|
||||
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
|
||||
int64_t block_size, torch::Tensor& input_tokens,
|
||||
torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions,
|
||||
torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping,
|
||||
torch::Tensor& block_tables) {
|
||||
prepare_inputs::advance_step_flashattn(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables);
|
||||
}
|
||||
|
||||
void advance_step_flashinfer(
|
||||
int64_t num_seqs, int64_t num_queries, int64_t block_size,
|
||||
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
|
||||
torch::Tensor& input_positions, torch::Tensor& seq_lens,
|
||||
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
|
||||
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
|
||||
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
|
||||
prepare_inputs::advance_step_flashinfer(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
|
||||
paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
|
||||
}
|
||||
@@ -74,11 +74,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // prepare_inputs advance_step
  ops.def(
      "advance_step(int num_seqs, int num_queries, int block_size, "
      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
      "Tensor! input_tokens, Tensor sampled_token_ids, "
      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
      "Tensor block_tables) -> ()");
  ops.impl("advance_step", torch::kCUDA, &advance_step);
  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);

  ops.def(
      "advance_step_flashinfer("
      " int num_seqs, int num_queries, int block_size,"
      " Tensor! input_tokens, Tensor sampled_token_ids,"
      " Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
      " Tensor block_tables, Tensor! paged_kv_indices,"
      " Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
      " Tensor! block_table_bounds"
      ") -> ()");
  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
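For orientation, a rough sketch of driving the renamed `advance_step_flashattn` registration above from Python. The `torch.ops._C` namespace, the tensor shapes, and the direct call are assumptions based on how vLLM typically exposes its custom ops (normally this goes through `vllm._custom_ops` rather than being called by hand):

```python
import torch

num_seqs, num_queries, block_size, max_blocks = 4, 4, 16, 8

# Dtypes follow the schema above: int64 ("long") token/position/slot tensors,
# int32 seq_lens and block_tables. All tensors must live on the GPU.
input_tokens = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
sampled_token_ids = torch.zeros(num_queries, 1, dtype=torch.long, device="cuda")
input_positions = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
seq_lens = torch.ones(num_seqs, dtype=torch.int, device="cuda")
slot_mapping = torch.zeros(num_seqs, dtype=torch.long, device="cuda")
block_tables = torch.zeros(num_seqs, max_blocks, dtype=torch.int, device="cuda")

# Assumed entry point once the compiled extension is loaded; the op mutates the
# "Tensor!" arguments in place to prepare the next multi-step iteration.
torch.ops._C.advance_step_flashattn(
    num_seqs, num_queries, block_size,
    input_tokens, sampled_token_ids, input_positions,
    seq_lens, slot_mapping, block_tables)
```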
@@ -59,6 +59,20 @@ Build from source
      $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
      $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

- Third, build and install oneDNN library from source:

  .. code-block:: console

      $ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
      $ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
          -DONEDNN_BUILD_DOC=OFF \
          -DONEDNN_BUILD_EXAMPLES=OFF \
          -DONEDNN_BUILD_TESTS=OFF \
          -DONEDNN_BUILD_GRAPH=OFF \
          -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
          -DONEDNN_ENABLE_PRIMITIVE=MATMUL
      $ cmake --build ./oneDNN/build --target install --config Release

- Finally, build and install vLLM CPU backend:

  .. code-block:: console
@@ -26,6 +26,10 @@ You can install vLLM using pip:
    $ # Install vLLM with CUDA 12.1.
    $ pip install vllm

.. note::

    Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.

.. note::

    As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.

@@ -34,7 +38,7 @@ You can install vLLM using pip:
    .. code-block:: console

        $ # Install vLLM with CUDA 11.8.
        $ export VLLM_VERSION=0.4.0
        $ export VLLM_VERSION=0.6.1.post1
        $ export PYTHON_VERSION=310
        $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118

@@ -48,7 +52,7 @@ You can install vLLM using pip:

    .. code-block:: console

        $ export VLLM_VERSION=0.5.4 # vLLM's main branch version is currently set to latest released tag
        $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag
        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
        $ # You can also access a specific commit
        $ # export VLLM_COMMIT=...

@@ -80,11 +84,11 @@ You can also build and install vLLM from source:

.. tip::

    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.
    Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster.

.. tip::
    To avoid your system being overloaded, you can limit the number of compilation jobs
    to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
    to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:

    .. code-block:: console

@@ -99,7 +103,7 @@ You can also build and install vLLM from source:
    $ # Use `--ipc=host` to make sure the shared memory is large enough.
    $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3

If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:

    .. code-block:: console
@@ -254,7 +254,7 @@ Multimodal Language Models
      -
    * - :code:`QWenLMHeadModel`
      - Qwen-VL
      - Image\ :sup:`E`
      - Image\ :sup:`E+`
      - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
      -
    * - :code:`Qwen2VLForConditionalGeneration`

@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore

We have the following levels of testing for models:

1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
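As a rough illustration of the "Strict Consistency" level described above, a minimal greedy-decoding comparison between HuggingFace Transformers and vLLM might look like the sketch below. The model, prompt, and token budget are placeholders; the real tests live under `tests/models` and use the shared `check_outputs_equal` helpers:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "facebook/opt-125m"  # placeholder model
prompt = "The capital of France is"
max_tokens = 16

# HuggingFace reference output under greedy decoding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
hf_ids = hf_model.generate(**tokenizer(prompt, return_tensors="pt"),
                           do_sample=False, max_new_tokens=max_tokens)
hf_text = tokenizer.decode(hf_ids[0], skip_special_tokens=True)

# vLLM output with temperature=0 (greedy).
llm = LLM(model=model_name)
vllm_out = llm.generate([prompt], SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens))
vllm_text = prompt + vllm_out[0].outputs[0].text

assert hf_text == vllm_text, (hf_text, vllm_text)
```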
@@ -11,7 +11,7 @@ from vllm.sampling_params import SamplingParams
# - Server:
#
# ```bash
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer_mode mistral --limit_mm_per_prompt 'image=4' --max_num_batched_tokens 16384
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ```
#
# - Client:
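The client half of that comment is cut off in this view. A hypothetical request against the server command shown above, using the OpenAI-compatible API (base URL, API key, and image URL are placeholders; vLLM's server listens on port 8000 by default):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="mistralai/Pixtral-12B-2409",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in one sentence."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/some-image.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```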
@@ -45,6 +45,7 @@ def run_simple_demo():
    model_name = "mistralai/Pixtral-12B-2409"
    sampling_params = SamplingParams(max_tokens=8192)

    # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
    llm = LLM(model=model_name, tokenizer_mode="mistral")

    prompt = "Describe this image in one sentence."

@@ -83,7 +84,7 @@ def run_advanced_demo():
        model=model_name,
        tokenizer_mode="mistral",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_num_batched_tokens=max_img_per_msg * max_tokens_per_img,
        max_model_len=max_img_per_msg * max_tokens_per_img,
    )

    prompt = "Describe the following image."
@@ -19,7 +19,39 @@ IMAGE_URLS = [
|
||||
]
|
||||
|
||||
|
||||
def load_phi3v(question, image_urls: List[str]):
|
||||
def load_qwenvl_chat(question: str, image_urls: List[str]):
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
placeholders = "".join(f"Picture {i}: <img></img>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
|
||||
# This model does not have a chat_template attribute on its tokenizer,
|
||||
# so we need to explicitly pass it. We use ChatML since it's used in the
|
||||
# generation utils of the model:
|
||||
# https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
|
||||
# Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
|
||||
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501
|
||||
|
||||
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
chat_template=chat_template)
|
||||
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
return llm, prompt, stop_token_ids, None, chat_template
|
||||
|
||||
|
||||
def load_phi3v(question: str, image_urls: List[str]):
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
@@ -30,10 +62,10 @@ def load_phi3v(question, image_urls: List[str]):
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids, None
|
||||
return llm, prompt, stop_token_ids, None, None
|
||||
|
||||
|
||||
def load_internvl(question, image_urls: List[str]):
|
||||
def load_internvl(question: str, image_urls: List[str]):
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
@@ -61,7 +93,7 @@ def load_internvl(question, image_urls: List[str]):
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
return llm, prompt, stop_token_ids, None
|
||||
return llm, prompt, stop_token_ids, None, None
|
||||
|
||||
|
||||
def load_qwen2_vl(question, image_urls: List[str]):
|
||||
@@ -111,18 +143,19 @@ def load_qwen2_vl(question, image_urls: List[str]):
|
||||
else:
|
||||
image_data, _ = process_vision_info(messages)
|
||||
|
||||
return llm, prompt, stop_token_ids, image_data
|
||||
return llm, prompt, stop_token_ids, image_data, None
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"phi3_v": load_phi3v,
|
||||
"internvl_chat": load_internvl,
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
"qwen_vl_chat": load_qwenvl_chat,
|
||||
}
|
||||
|
||||
|
||||
def run_generate(model, question: str, image_urls: List[str]):
|
||||
llm, prompt, stop_token_ids, image_data = model_example_map[model](
|
||||
llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
|
||||
question, image_urls)
|
||||
if image_data is None:
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
@@ -146,29 +179,32 @@ def run_generate(model, question: str, image_urls: List[str]):
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: List[str]):
|
||||
llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls)
|
||||
llm, _, stop_token_ids, _, chat_template = model_example_map[model](
|
||||
question, image_urls)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=stop_token_ids)
|
||||
|
||||
outputs = llm.chat([{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
*({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
outputs = llm.chat(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": question,
|
||||
},
|
||||
} for image_url in image_urls),
|
||||
],
|
||||
}],
|
||||
sampling_params=sampling_params)
|
||||
*({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
} for image_url in image_urls),
|
||||
],
|
||||
}],
|
||||
sampling_params=sampling_params,
|
||||
chat_template=chat_template,
|
||||
)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
|
||||
@@ -16,7 +16,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

llm.start_profile()
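The hunk above is cut off at `start_profile()`. For context, a minimal sketch of the full profiling pattern this example follows; the `stop_profile()` call and the `VLLM_TORCH_PROFILER_DIR` environment variable are assumptions not shown in this view:

```python
from vllm import LLM, SamplingParams

# Assumes VLLM_TORCH_PROFILER_DIR is set so the profiler has somewhere to write traces.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm.start_profile()
outputs = llm.generate(["Hello, my name is"], sampling_params)
llm.stop_profile()  # assumed counterpart to start_profile()

for output in outputs:
    print(output.outputs[0].text)
```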
@@ -76,7 +76,7 @@ exclude = [

[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile"
skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"

[tool.isort]
use_parentheses = true
@@ -85,5 +85,6 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
    "skip_global_cleanup",
    "vlm: run tests for vision language models only",
    "core_model: run this model test in each PR instead of just daily",
    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
]

@@ -7,11 +7,12 @@ py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
fastapi < 0.113.0; python_version < '3.9'
fastapi >= 0.114.1; python_version >= '3.9'
aiohttp
openai >= 1.40.0 # Ensure modern openai package (ensure types module present)
uvicorn[standard]
pydantic >= 2.8 # Required for OpenAI server.
pydantic >= 2.9 # Required for fastapi >= 0.113.0
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
||||
@@ -1,4 +1,3 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
@@ -26,8 +25,7 @@ def _query_server_long(prompt: str) -> dict:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
||||
worker_use_ray: bool):
|
||||
def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
@@ -37,25 +35,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
|
||||
str(tokenizer_pool_size)
|
||||
]
|
||||
|
||||
# Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
|
||||
# to prevent `--engine-use-ray` raises an exception due to it deprecation
|
||||
env_vars = os.environ.copy()
|
||||
env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
||||
|
||||
if engine_use_ray:
|
||||
commands.append("--engine-use-ray")
|
||||
if worker_use_ray:
|
||||
commands.append("--worker-use-ray")
|
||||
uvicorn_process = subprocess.Popen(commands, env=env_vars)
|
||||
uvicorn_process = subprocess.Popen(commands)
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
@pytest.mark.parametrize("worker_use_ray", [False, True])
|
||||
@pytest.mark.parametrize("engine_use_ray", [False, True])
|
||||
def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
|
||||
engine_use_ray: bool):
|
||||
def test_api_server(api_server, tokenizer_pool_size: int,
|
||||
worker_use_ray: bool):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
from asyncio import CancelledError
|
||||
from copy import copy
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
@@ -12,6 +14,7 @@ from vllm import SamplingParams
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput as RealRequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
|
||||
from ..conftest import cleanup
|
||||
from ..utils import wait_for_gpu_memory_to_clear
|
||||
@@ -72,14 +75,12 @@ class MockEngine:
|
||||
|
||||
|
||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||
|
||||
def _init_engine(self, *args, **kwargs):
|
||||
return MockEngine()
|
||||
_engine_class = MockEngine
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_new_requests_event():
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=False)
|
||||
engine.start_background_loop()
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.step_calls == 0
|
||||
@@ -112,16 +113,11 @@ async def test_new_requests_event():
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
# Allow deprecated engine_use_ray to not raise exception
|
||||
os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
|
||||
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True)
|
||||
assert engine.get_model_config() is not None
|
||||
assert engine.get_tokenizer() is not None
|
||||
assert engine.get_decoding_config() is not None
|
||||
|
||||
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
|
||||
|
||||
|
||||
def start_engine():
|
||||
wait_for_gpu_memory_to_clear(
|
||||
@@ -130,8 +126,17 @@ def start_engine():
|
||||
timeout_s=60,
|
||||
)
|
||||
|
||||
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
|
||||
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
|
||||
|
||||
return AsyncLLMEngine.from_engine_args(
|
||||
AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
|
||||
AsyncEngineArgs(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
num_scheduler_steps=num_scheduler_steps))
|
||||
|
||||
|
||||
def uid() -> str:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="module")
|
||||
@@ -154,59 +159,195 @@ def should_do_global_cleanup_after_test(request) -> bool:
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_asyncio_run(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_asyncio_run(async_engine, stop):
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
async def run(prompt: str):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
sampling_params,
|
||||
request_id=prompt):
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
return final_output
|
||||
return final_output, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("test0"),
|
||||
run("test1"),
|
||||
run("test0"),
|
||||
)
|
||||
assert len(results) == 2
|
||||
first, second = results
|
||||
|
||||
# remove nondeterministic fields for comparison
|
||||
first[0].metrics = None
|
||||
second[0].metrics = None
|
||||
first[0].request_id = None
|
||||
second[0].request_id = None
|
||||
|
||||
assert str(first) == str(second)
|
||||
|
||||
output_count = results[0][1]
|
||||
if num_scheduler_steps == 1:
|
||||
assert output_count == 32
|
||||
else:
|
||||
assert 1 < output_count < 32
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_cancellation(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_output_kinds(async_engine, stop):
|
||||
"""Test that output_kind works as expected and that
|
||||
results are equivalent across different kinds."""
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
async def run(prompt: str, kind: RequestOutputKind):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = kind
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return (final_output.prompt_token_ids,
|
||||
final_output.outputs[0].token_ids,
|
||||
final_output.outputs[0].text, output_count)
|
||||
|
||||
async def run_deltas(prompt: str):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = RequestOutputKind.DELTA
|
||||
|
||||
prompt_tokens = None
|
||||
output_tokens: List[int] = []
|
||||
output_text = ""
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
token_ids = output.outputs[0].token_ids
|
||||
text = output.outputs[0].text
|
||||
final_output = output
|
||||
|
||||
# Ensure we get prompt ids iff we haven't yet received output tokens
|
||||
if output_tokens:
|
||||
assert 1 <= len(token_ids) <= num_scheduler_steps
|
||||
assert stop or text
|
||||
assert not output.prompt_token_ids
|
||||
else:
|
||||
assert output.prompt_token_ids
|
||||
prompt_tokens = output.prompt_token_ids
|
||||
|
||||
output_tokens.extend(token_ids)
|
||||
output_text += text
|
||||
|
||||
output_count += 1
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return prompt_tokens, output_tokens, output_text, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("common input prompt", RequestOutputKind.CUMULATIVE),
|
||||
run("common input prompt", RequestOutputKind.FINAL_ONLY),
|
||||
run_deltas("common input prompt"))
|
||||
|
||||
# Make sure outputs are the same
|
||||
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
|
||||
assert len(prompt_set) == 1
|
||||
|
||||
text_set = set(text for _, _, text, _ in results)
|
||||
assert len(text_set) == 1
|
||||
|
||||
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
|
||||
assert len(tokens_set) == 1
|
||||
|
||||
cumulative, final, deltas = results
|
||||
|
||||
# output message counts
|
||||
assert cumulative[3] == deltas[3]
|
||||
|
||||
if num_scheduler_steps == 1:
|
||||
assert cumulative[3] == 32
|
||||
else:
|
||||
assert 1 < cumulative[3] < 32
|
||||
|
||||
assert final[3] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_cancellation(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=13,
|
||||
max_tokens=13,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stop_at = 5 if num_scheduler_steps == 1 else 1
|
||||
|
||||
request_id = uid()
|
||||
|
||||
i = 0
|
||||
with pytest.raises(CancelledError):
|
||||
async for output in async_engine.generate("test2",
|
||||
sampling_params,
|
||||
request_id="test2"):
|
||||
request_id=request_id):
|
||||
assert not output.finished
|
||||
i += 1
|
||||
if i == 5:
|
||||
await async_engine.abort("test2")
|
||||
if i == stop_at:
|
||||
await async_engine.abort(request_id)
|
||||
|
||||
assert i == 5
|
||||
assert i == stop_at
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_delayed_generator(async_engine):
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_delayed_generator(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
|
||||
if scheduler_config.num_scheduler_steps != 1:
|
||||
pytest.skip("no need to test this one with multistep")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stream = async_engine.generate("test3",
|
||||
sampling_params,
|
||||
request_id="test3")
|
||||
stream = async_engine.generate("test3", sampling_params, request_id=uid())
|
||||
i = 0
|
||||
final_output: Optional[RealRequestOutput] = None
|
||||
async for output in stream:
|
||||
|
||||
@@ -19,16 +19,11 @@ def server():
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--engine-use-ray",
        "--chat-template",
        str(chatml_jinja_path),
    ]

    # Allow `--engine-use-ray`, otherwise the launch of the server throw
    # an error due to try to use a deprecated feature
    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
    with RemoteOpenAIServer(MODEL_NAME, args,
                            env_dict=env_dict) as remote_server:
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@@ -3,20 +3,27 @@
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import weakref
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.utils import is_hip
|
||||
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
]
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
@@ -64,3 +71,88 @@ def test_models(
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
attention_backend: str,
|
||||
test_suite: str,
|
||||
) -> None:
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def test_model_with_failure(vllm_runner) -> None:
|
||||
try:
|
||||
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
|
||||
side_effect=ValueError()):
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
vllm_runner("facebook/opt-125m",
|
||||
dtype="half",
|
||||
enforce_eager=False,
|
||||
gpu_memory_utilization=0.7)
|
||||
matches = re.search(r"input dumped to (.+).pkl",
|
||||
str(exc_info.value))
|
||||
assert matches is not None
|
||||
filename = f"{matches.group(1)}.pkl"
|
||||
|
||||
with open(filename, "rb") as filep:
|
||||
inputs = pickle.load(filep)
|
||||
|
||||
if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
|
||||
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
|
||||
f"{list(inputs.keys())}")
|
||||
assert isinstance(inputs["arg_1"],
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
finally:
|
||||
os.remove(filename)
|
||||
|
||||
@@ -6,11 +6,13 @@ prefill requests are chunked.
|
||||
|
||||
Run `pytest tests/models/test_chunked_prefill.py`.
|
||||
"""
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
|
||||
import pytest
|
||||
|
||||
from ..models.utils import check_logprobs_close, check_outputs_equal
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@@ -66,6 +68,59 @@ def test_models(
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if (model == "meta-llama/Llama-2-7b-hf"
|
||||
and distributed_executor_backend == "ray"):
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,model",
|
||||
[("fp8_e4m3",
|
||||
|
||||
@@ -19,10 +19,13 @@ MODELS = [
"facebook/opt-125m",
]

assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")

@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")


@pytest.fixture
@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt

@@ -16,5 +16,7 @@ def test_full_graph(model):
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B")
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
enforce_eager=True,
load_format="dummy")
llm.generate(prompts, sampling_params)

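Aside: the rewritten call above relies on vLLM's `load_format="dummy"`, which fills the model with randomly initialized weights instead of downloading a checkpoint, so the compile smoke test only exercises the execution path. A minimal sketch of that mode (prompt and sampling settings are illustrative):

```python
from vllm import LLM, SamplingParams

# Sketch: "dummy" load format initializes random weights in place of the real
# checkpoint, so this runs without fetching the Llama-3 files; enforce_eager
# skips CUDA graph capture to keep the smoke test fast.
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
          enforce_eager=True,
          load_format="dummy")
llm.generate(["The future of AI is"], SamplingParams(temperature=0))
```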
@@ -6,8 +6,8 @@ import sys
import tempfile
from collections import UserList
from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
TypeVar, Union)
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
TypedDict, TypeVar, Union)

import numpy as np
import pytest
@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
@@ -260,7 +261,7 @@ class HfRunner:
*,
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
auto_cls=AutoModelForCausalLM,
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity,
) -> None:
@@ -292,20 +293,14 @@ class HfRunner:
trust_remote_code=True,
)

try:
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
)
except Exception as exc:
logger.warning(
"Unable to auto-load HuggingFace processor for model (%s). "
"Using tokenizer instead. Reason: %s", model_name, exc)
self.processor = self.tokenizer
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401
self.processor = AutoProcessor.from_pretrained(
model_name,
torch_dtype=torch_dtype,
trust_remote_code=True,
)

self.postprocess_inputs = postprocess_inputs

@@ -658,8 +653,8 @@ class VllmRunner:
outputs.append((req_sample_output_ids, req_sample_output_strs))
return outputs

@staticmethod
def _final_steps_generate_w_logprobs(
self,
req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []

@@ -1,80 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness.py
|
||||
```
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
attention_backend: str,
|
||||
test_suite: str,
|
||||
) -> None:
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,102 +0,0 @@
|
||||
"""For encoder/decoder models only:
|
||||
Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness_enc_dec.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/bart-large-cnn", "ray"),
|
||||
("facebook/bart-large-cnn", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
) -> None:
|
||||
'''
|
||||
Test vLLM BART inference on more than one GPU, comparing
|
||||
outputs against HF as a baseline.
|
||||
|
||||
Fork a new process for each test, to prevent CUDA from
|
||||
being re-initialized by successive tests within the same
|
||||
process.
|
||||
|
||||
Arguments:
|
||||
|
||||
* model: the HF ID of the specific BART variant under test
|
||||
* distributed_executor_backend
|
||||
* hf_runner: HuggingFace (HF) test model runner
|
||||
* vllm_runner: vLLM test model runner
|
||||
* example_encoder_decoder_prompts: test fixture which provides a
|
||||
dictionary of dummy prompts
|
||||
'''
|
||||
|
||||
dtype = "float"
|
||||
max_tokens = 64
|
||||
num_logprobs = 5
|
||||
|
||||
# Example inputs with non-trivial (i.e. not None/empty) encoder &
|
||||
# decoder prompts.
|
||||
test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,75 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/opt-125m", "ray"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray"),
|
||||
("facebook/opt-125m", "mp"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
|
||||
assert distributed_executor_backend == "ray"
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -1,58 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest -s -v test_multimodal_broadcast.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("llava-hf/llava-1.5-7b-hf", "ray"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
|
||||
("facebook/chameleon-7b", "ray"),
|
||||
("llava-hf/llava-1.5-7b-hf", "mp"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
|
||||
("facebook/chameleon-7b", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model: str,
|
||||
distributed_executor_backend: str) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from ..models.test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from ..models.test_llava_next import run_test # type: ignore[no-redef]
|
||||
from ..models.test_llava_next import models
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from ..models.test_chameleon import run_test # type: ignore[no-redef]
|
||||
from ..models.test_chameleon import models
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"),
# NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
],
)
@fork_new_process_for_each_test

@@ -1,13 +1,13 @@
import os

import torch
import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

torch.distributed.init_process_group(backend="gloo")
test_result = all(
in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
if __name__ == "__main__":
dist.init_process_group(backend="gloo")
test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")

@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError) as err:

with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params)
assert "prompts must be None if" in str(err.value)

outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params)
assert len(outputs) > 0

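Aside on the `pytest.raises(..., match=...)` form adopted above: `match` is applied with `re.search` against the string form of the raised exception, so the separate substring assertion can be dropped. A small self-contained sketch (the helper name is made up):

```python
import pytest


def reject_text_prompt(prompt: str) -> None:
    # Stand-in for a generate() call that refuses text prompts when the
    # tokenizer was skipped at init time.
    raise ValueError("cannot pass text prompts when tokenizer is unavailable")


def test_match_checks_the_message() -> None:
    # match= is re.search'd against str(exception), replacing a manual assert.
    with pytest.raises(ValueError, match="cannot pass text prompts when"):
        reject_text_prompt("abc")
```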
0  tests/entrypoints/offline_mode/__init__.py  Normal file
77  tests/entrypoints/offline_mode/test_offline_mode.py  Normal file
@@ -0,0 +1,77 @@
"""Tests for HF_HUB_OFFLINE mode"""
import importlib
import sys
import weakref

import pytest

from vllm import LLM

from ...conftest import cleanup

MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)

del llm

cleanup()


@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
# we use the llm fixture to ensure the model files are in-cache
del llm

# Set HF to offline mode and ensure we can still construct an LLM
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
LLM(model=MODEL_NAME,
max_num_batched_tokens=4096,
tensor_parallel_size=1,
gpu_memory_utilization=0.10,
enforce_eager=True)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
_re_import_modules()
pass


def _re_import_modules():
hf_hub_module_names = [
k for k in sys.modules if k.startswith("huggingface_hub")
]
transformers_module_names = [
k for k in sys.modules if k.startswith("transformers")
and not k.startswith("transformers_modules")
]

reload_exception = None
for module_name in hf_hub_module_names + transformers_module_names:
try:
importlib.reload(sys.modules[module_name])
except Exception as e:
reload_exception = e
# Try to continue clean up so that other tests are less likely to
# be affected

# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
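The reload step in `_re_import_modules` exists because `HF_HUB_OFFLINE` is read into module-level constants at import time, so flipping the environment variable alone does not affect modules that are already imported. A rough standalone sketch of the same idea (assumes `huggingface_hub` is installed; reload order is not guaranteed to be safe for every module, which is why the test above collects and re-raises reload errors instead):

```python
import importlib
import os
import sys

os.environ["HF_HUB_OFFLINE"] = "1"

# Reload every already-imported huggingface_hub module so its constants
# re-read the environment variable.
for name in [m for m in sys.modules if m.startswith("huggingface_hub")]:
    importlib.reload(sys.modules[name])

from huggingface_hub import constants

print(constants.HF_HUB_OFFLINE)  # expected: True after the reload
```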
@@ -10,7 +10,6 @@ import pytest
import torch

from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.attention.backends.xformers import XFormersBackend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
make_tensor_with_pad)

@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
* Backend instance
'''
if backend_name == STR_XFORMERS_ATTN_VAL:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from vllm.attention.backends.xformers import XFormersBackend

return XFormersBackend()
raise AssertionError(
f"Unrecognized backend_name {backend_name} for unit test")

0  tests/models/decoder_only/__init__.py  Normal file
@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

from ..conftest import HfRunner, VllmRunner
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm
from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close

MODEL_NAME = "fixie-ai/ultravox-v0_3"

0  tests/models/decoder_only/language/__init__.py  Normal file
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["h2oai/h2o-danube3-4b-base"]
|
||||
|
||||
@@ -10,7 +10,7 @@ import pytest
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -11,7 +11,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -15,7 +15,7 @@ import pytest
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@@ -10,9 +10,10 @@ from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPair:
|
||||
@@ -6,7 +6,7 @@ import importlib.metadata
|
||||
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
TRANSFORMERS_VERSION = tuple(
|
||||
map(int,
|
||||
@@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_outputs_equal
|
||||
from vllm.worker.model_runner import _get_graph_batch_size
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["ai21labs/Jamba-tiny-random"]
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.1",
|
||||
@@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalData objects and corresponding
|
||||
MultiModalConfig as input.
|
||||
42  tests/models/decoder_only/vision_language/test_broadcast.py  Normal file
@@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", [
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"facebook/chameleon-7b",
|
||||
])
|
||||
def test_models(hf_runner, vllm_runner, image_assets,
|
||||
distributed_executor_backend, model) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from .test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from .test_llava_next import models, run_test # type: ignore[no-redef]
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from .test_chameleon import models, run_test # type: ignore[no-redef]
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_outputs_equal
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -36,7 +34,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -46,7 +44,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -6,9 +6,7 @@ import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from ..conftest import _ImageAssets, cleanup
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import _ImageAssets, cleanup
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
|
||||
# dynamic_module and trust_remote_code for hf_runner
|
||||
@@ -9,11 +9,9 @@ from transformers import AutoConfig
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -78,7 +76,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -331,6 +329,41 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"])
|
||||
@pytest.mark.parametrize("size_factors", [[0.5, 1.0]])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_different_num_patches(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in image_assets]
|
||||
|
||||
inputs_batching = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
inputs_multi_images = [
|
||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors])
|
||||
]
|
||||
for inputs in [inputs_batching, inputs_multi_images]:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
|
||||
@pytest.mark.parametrize(
|
||||
@@ -8,11 +8,9 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@@ -143,7 +141,7 @@ def _run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -239,7 +237,7 @@ def _run_test(
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@@ -5,10 +5,8 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -62,7 +60,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_PREFACE = (
|
||||
"A chat between a curious human and an artificial intelligence assistant. "
|
||||
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# The image token is placed before "user" on purpose so that the test can pass
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
@@ -65,7 +63,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -69,7 +67,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu, is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@@ -71,7 +69,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
199  tests/models/decoder_only/vision_language/test_pixtral.py  Normal file
@@ -0,0 +1,199 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import ImageURLChunk
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from ....utils import VLLM_PATH
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
IMG_URLS = [
|
||||
"https://picsum.photos/id/237/400/300",
|
||||
"https://picsum.photos/id/231/200/300",
|
||||
"https://picsum.photos/id/27/500/500",
|
||||
"https://picsum.photos/id/17/150/600",
|
||||
]
|
||||
PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}] + [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url
|
||||
}
|
||||
} for url in urls],
|
||||
}]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
|
||||
msg = _create_msg_format(urls)
|
||||
|
||||
tokenizer = MistralTokenizer.from_model("pixtral")
|
||||
|
||||
request = ChatCompletionRequest(messages=msg) # type: ignore[type-var]
|
||||
tokenized = tokenizer.encode_chat_completion(request)
|
||||
|
||||
engine_inputs = TokensPrompt(prompt_token_ids=tokenized.tokens)
|
||||
|
||||
images = []
|
||||
for chunk in request.messages[0].content:
|
||||
if isinstance(chunk, ImageURLChunk):
|
||||
images.append(image_from_chunk(chunk))
|
||||
|
||||
mm_data = MultiModalDataBuiltins(image=images)
|
||||
engine_inputs["multi_modal_data"] = mm_data
|
||||
|
||||
return engine_inputs
|
||||
|
||||
|
||||
MSGS = [
|
||||
_create_msg_format(IMG_URLS[:1]),
|
||||
_create_msg_format(IMG_URLS[:2]),
|
||||
_create_msg_format(IMG_URLS),
|
||||
]
|
||||
ENGINE_INPUTS = [
|
||||
_create_engine_inputs(IMG_URLS[:1]),
|
||||
_create_engine_inputs(IMG_URLS[:2]),
|
||||
_create_engine_inputs(IMG_URLS),
|
||||
]
|
||||
|
||||
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
MAX_MODEL_LEN = [8192, 65536]
|
||||
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
||||
|
||||
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text,
|
||||
[{k: asdict(v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])])
|
||||
for tokens, text, logprobs in outputs]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [(tokens, text,
|
||||
[{int(k): Logprob(**v)
|
||||
for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs])
|
||||
for tokens, text, logprobs in json_data]
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(
|
||||
vllm_runner,
|
||||
max_model_len: int,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
for msg in MSGS:
|
||||
output = vllm_model.model.chat(msg,
|
||||
sampling_params=SAMPLING_PARAMS)
|
||||
|
||||
outputs.extend(output)
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
|
||||
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
|
||||
args = EngineArgs(
|
||||
model=model,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
dtype=dtype,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(args)
|
||||
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
|
||||
|
||||
outputs = []
|
||||
count = 0
|
||||
while True:
|
||||
out = engine.step()
|
||||
count += 1
|
||||
for request_output in out:
|
||||
if request_output.finished:
|
||||
outputs.append(request_output)
|
||||
|
||||
if count == 2:
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
|
||||
SAMPLING_PARAMS)
|
||||
if not engine.has_unfinished_requests():
|
||||
break
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
401  tests/models/decoder_only/vision_language/test_qwen.py  Normal file
@@ -0,0 +1,401 @@
|
||||
import pathlib
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputContext, LLMInputs
|
||||
from vllm.multimodal.base import MultiModalInputs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
||||
VllmRunner, _ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
text_only_models = [
|
||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||
]
|
||||
|
||||
multimodal_models = ["Qwen/Qwen-VL"]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||
"cherry_blossom":
|
||||
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||
})
|
||||
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nCan you compare these images?\n" # noqa: E501
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "Picture 1: <img></img>\nPicture 2: <img></img>\nDescribe the two images in detail.\n" # noqa: E501
|
||||
### Multimodal preprocessing tests
|
||||
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||
# easy to read.
|
||||
IMG_START_ID = 151857
|
||||
IMG_END_ID = 151858
|
||||
IMG_PAD_ID = 151859
|
||||
TOKS_PER_IMG = 256
|
||||
VIS_ENC_DIM = 4096
|
||||
IMG_SIZE = 448
|
||||
|
||||
|
||||
def build_model_context(model_name: str,
|
||||
tokenizer_name: Optional[str] = None,
|
||||
trust_remote_code: bool = False):
|
||||
"""Creates an InputContext for a given model.
|
||||
|
||||
Args:
|
||||
model_name: Name of the model being considered.
|
||||
tokenizer_name: Name of the tokenizer being considered.
|
||||
trust_remote_code: Whether or not to allow loading remote code.
|
||||
|
||||
Returns:
|
||||
InputContext for the model being considered.
|
||||
"""
|
||||
if tokenizer_name is None:
|
||||
tokenizer_name = model_name
|
||||
model_config = ModelConfig(
|
||||
model_name,
|
||||
tokenizer_name,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype="float32",
|
||||
seed=0,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_mapper_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
||||
return input_mapper_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_processor_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
||||
return input_processor_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def qwen_vl_context() -> InputContext:
|
||||
"""Get an InputContext for Qwen-VL."""
|
||||
return build_model_context(model_name="Qwen/Qwen-VL",
|
||||
trust_remote_code=True)
|
||||
|
||||
|
||||
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||
# input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("num_images", [1, 2])
|
||||
def test_input_processor_valid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
num_images: int):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input processor."""
|
||||
prompt = "".join(
|
||||
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
|
||||
inputs = LLMInputs(
|
||||
prompt=prompt,
|
||||
# When processing multimodal data for a multimodal model, the qwen
|
||||
# input processor will overwrite the provided prompt_token_ids with
|
||||
# the image prompts
|
||||
prompt_token_ids=None,
|
||||
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
|
||||
)
|
||||
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
assert isinstance(proc_inputs, dict)
|
||||
|
||||
# Each image should have one start / stop and a fixed context of 256
|
||||
proc_tokens = proc_inputs["prompt_token_ids"]
|
||||
assert proc_tokens.count(IMG_START_ID) == num_images
|
||||
assert proc_tokens.count(IMG_END_ID) == num_images
|
||||
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data,expected_shape",
|
||||
[
|
||||
# single / multi-image
|
||||
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
|
||||
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
|
||||
# single / multi-image embeddings
|
||||
(torch.rand(
|
||||
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
])
|
||||
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image],
|
||||
Image],
|
||||
expected_shape: List[int]):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
|
||||
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
# Ensure that we get the appropriately shaped pixel_values
|
||||
# for images and image embeddings, respectively.
|
||||
assert isinstance(mapped_img_data, MultiModalInputs)
|
||||
assert "pixel_values" in mapped_img_data
|
||||
assert mapped_img_data["pixel_values"].shape == expected_shape
|
||||
|
||||
|
||||
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("mm_data", [
|
||||
{
|
||||
"image": torch.rand((5))
|
||||
},
|
||||
{
|
||||
"image": torch.rand((5, 5, 5, 5, 5))
|
||||
},
|
||||
])
|
||||
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
mm_data: Dict[str, torch.Tensor]):
|
||||
"""Test sad cases validated in Qwen's multimodal input processor."""
|
||||
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
|
||||
trust_remote_code=True)
|
||||
prompt = "Picture 1: <img></img>\n"
|
||||
prompt_token_ids = tokenizer.encode(prompt)
|
||||
inputs = LLMInputs(prompt=prompt,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=mm_data)
|
||||
# Should fail since we have too many or too few dimensions for embeddings
|
||||
with pytest.raises(ValueError):
|
||||
input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data",
|
||||
[
|
||||
# Wrong context length
|
||||
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
|
||||
# Wrong visual encoder output size
|
||||
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
|
||||
])
|
||||
def test_input_mapper_invalid_mm_data(
|
||||
input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image], Image],
|
||||
):
|
||||
"""Sad cases validated in Qwen VL's multimodal input mapper."""
|
||||
with pytest.raises(ValueError):
|
||||
input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
|
||||
|
||||
### End-to-end generation tests
|
||||
def get_prompt_with_path(tmp_path: pathlib.PosixPath, prompt: str,
|
||||
assets: Union[_ImageAssets, List[ImageAsset]]) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
forward() call.
|
||||
|
||||
Args:
|
||||
tmp_path: Tempdir for test under consideration.
|
||||
prompt: Prompt with image placeholders.
|
||||
assets: List of image assets whose len equals the num placeholders.
|
||||
"""
|
||||
# Ensure that the number of placeholders matches the number of assets;
|
||||
# If this is not true, the test is probably written incorrectly.
|
||||
assert prompt.count("<img></img>") == len(assets)
|
||||
|
||||
# Replace the placeholders with local paths to the exported assets
|
||||
for asset in assets:
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
prompt = prompt.replace(
|
||||
"<img></img>",
|
||||
f"<img>{image_tmp_path}</img>",
|
||||
1,
|
||||
)
|
||||
return prompt
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
inputs: List[Tuple[List[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
# Qwen encodes each image into a fixed content size of 256
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_single_image(tmp_path: pathlib.PosixPath,
                                        hf_runner: Type[HfRunner],
                                        vllm_runner: Type[VllmRunner],
                                        image_assets: _ImageAssets, model: str,
                                        size_factors: List[float], dtype: str,
                                        max_tokens: int,
                                        num_logprobs: int) -> None:
    """Tests multimodal models with single image prompts."""
    images = [asset.pil_image for asset in image_assets]

    prompts = [
        get_prompt_with_path(tmp_path, prompt, [asset])
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    inputs = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, prompts)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )

@pytest.mark.parametrize("model", multimodal_models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_multimodal_models_multi_image(tmp_path: pathlib.PosixPath,
                                       hf_runner: Type[HfRunner],
                                       vllm_runner: Type[VllmRunner],
                                       image_assets: _ImageAssets, model: str,
                                       size_factors: List[float], dtype: str,
                                       max_tokens: int,
                                       num_logprobs: int) -> None:
    """Tests multimodal models with multi-image prompts."""
    images = [asset.pil_image for asset in image_assets]
    # Put all of the images into one prompt.
    prompt = get_prompt_with_path(tmp_path, HF_MULTIIMAGE_IMAGE_PROMPT,
                                  image_assets)
    inputs = [([prompt for _ in size_factors],
               [[rescale_image_size(image, factor) for image in images]
                for factor in size_factors])]

    run_test(
        hf_runner,
        vllm_runner,
        inputs,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )

# Ensure that a text-only Qwen model can still be loaded and
# used for inference in vLLM without throwing.
@pytest.mark.parametrize("model", text_only_models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    vllm_runner: Type[VllmRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs=num_logprobs,
        )
tests/models/embedding/__init__.py (new empty file)
tests/models/embedding/language/__init__.py (new empty file)
tests/models/encoder_decoder/__init__.py (new empty file)
tests/models/encoder_decoder/language/__init__.py (new empty file)
@@ -1,8 +1,8 @@
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.

Run `pytest tests/models/test_bart.py`.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Type

from vllm.utils import is_cpu

@@ -16,8 +16,10 @@ if not is_cpu():

    from vllm.sequence import SampleLogprobs

    from ..conftest import DecoderPromptType
    from .utils import check_logprobs_close
    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
                              HfRunner, VllmRunner)
    from ....utils import multi_gpu_test
    from ...utils import check_logprobs_close

    MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]

@@ -34,20 +36,18 @@ if not is_cpu():

        return output_ids, hf_output_str, out_logprobs

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts,
    def run_test(
        hf_runner: Type[HfRunner],
        vllm_runner: Type[VllmRunner],
        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        decoder_prompt_type: DecoderPromptType,
        model: str,
        *,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
        decoder_prompt_type: DecoderPromptType,
        tensor_parallel_size: int,
        distributed_executor_backend: Optional[str] = None,
    ) -> None:
        '''
        Test the vLLM BART model for a variety of encoder/decoder input prompts,
@@ -116,8 +116,29 @@ if not is_cpu():
        token during the process of validating the vLLM decoded output.
        '''

        test_case_prompts = example_encoder_decoder_prompts[
            decoder_prompt_type]
        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with fork method (the default).

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because
        # for encoder/decoder models vLLM will
        # default to enforce_eager=True if enforce_eager
        # is left unspecified. However, the
        # VllmRunner test fixture (which wraps around the LLM class) defaults to
        # enforce_eager=False (a behavior which a number of already-existing
        # decoder-only unit tests expect), so when testing an encoder/decoder
        # model we must explicitly specify enforce_eager=True in the VllmRunner
        # constructor.
        with vllm_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=tensor_parallel_size,
                distributed_executor_backend=distributed_executor_backend,
                enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                prompts, max_tokens, num_logprobs)

        # Configuration settings for HF baseline
        hf_kwargs = {
@@ -135,26 +156,12 @@ if not is_cpu():
                auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    test_case_prompts,
                    prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))

        # Note: currently encoder/decoder models are only compatible with
        # enforce_eager=True. Normally this is not a problem because
        # for encoder/decoder models vLLM will
        # default to enforce_eager=True if enforce_eager
        # is left unspecified. However, the
        # VllmRunner test fixture (which wraps around the LLM class) defaults to
        # enforce_eager=False (a behavior which a number of already-existing
        # decoder-only unit tests expect), so when testing an encoder/decoder
        # model we must explicitly specify enforce_eager=True in the VllmRunner
        # constructor.
        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                test_case_prompts, max_tokens, num_logprobs)

        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)

@@ -168,3 +175,49 @@ if not is_cpu():
            name_1="vllm",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )

    @pytest.mark.parametrize("model", MODELS)
    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                    model, dtype, max_tokens, num_logprobs,
                    decoder_prompt_type) -> None:

        run_test(
            hf_runner,
            vllm_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=1,
        )

    @multi_gpu_test(num_gpus=2)
    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
    @pytest.mark.parametrize("dtype", ["float"])
    @pytest.mark.parametrize("max_tokens", [64])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
    def test_models_distributed(hf_runner, vllm_runner,
                                example_encoder_decoder_prompts,
                                distributed_executor_backend, model, dtype,
                                max_tokens, num_logprobs,
                                decoder_prompt_type) -> None:
        run_test(
            hf_runner,
            vllm_runner,
            example_encoder_decoder_prompts[decoder_prompt_type],
            decoder_prompt_type,
            model,
            dtype=dtype,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
        )
tests/models/fixtures/pixtral_chat.json (new file; diff suppressed because one or more lines are too long)
tests/models/fixtures/pixtral_chat_engine.json (new file; diff suppressed because one or more lines are too long)
@@ -1,64 +0,0 @@
|
||||
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Model is too big, test passed on A100 locally but will OOM on CI machine."
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
image_urls = [
|
||||
"https://picsum.photos/id/237/200/300",
|
||||
"https://picsum.photos/seed/picsum/200/300"
|
||||
]
|
||||
expected = [
|
||||
"The image depicts a black dog lying on a wooden surface, looking directly at the camera with a calm expression.", # noqa
|
||||
"The image depicts a serene landscape with a snow-covered mountain under a pastel-colored sky during sunset." # noqa
|
||||
]
|
||||
prompt = "Describe the image in one short sentence."
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
|
||||
|
||||
with vllm_runner(model, dtype=dtype,
|
||||
tokenizer_mode="mistral") as vllm_model:
|
||||
|
||||
for i, image_url in enumerate(image_urls):
|
||||
messages = [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}, {
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
}]
|
||||
},
|
||||
]
|
||||
|
||||
outputs = vllm_model.model.chat(messages,
|
||||
sampling_params=sampling_params)
|
||||
assert outputs[0].outputs[0].text == expected[i]
|
||||
@@ -1,165 +0,0 @@
|
||||
import pathlib
|
||||
from typing import List, Optional, Type
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
|
||||
text_only_models = [
|
||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||
]
|
||||
|
||||
multimodal_models = ["Qwen/Qwen-VL"]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"Picture 1: <img></img>\nWhat's the content of the image?: ",
|
||||
"cherry_blossom":
|
||||
"Picture 1: <img></img>\nWhat is the season?: ",
|
||||
})
|
||||
|
||||
|
||||
### Tests for multimodal Qwen models
|
||||
def run_test(
|
||||
tmp_path: pathlib.PosixPath,
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Export the images to a tempdir and substitute it into the hf prompt;
|
||||
# the contents between <img>/</img> will be ignored by VLLM, but the
|
||||
# transformers implementation for the visual transformer parses this to
|
||||
# reload it in the forward call; the contents are treated as a URL or a
|
||||
# local path.
|
||||
for idx, asset in enumerate(image_assets):
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace(
|
||||
"<img></img>", f"<img>{image_tmp_path}</img>")
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
# Qwen encodes images into a fixed content size of 256
|
||||
with vllm_runner(model,
|
||||
max_model_len=300,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
|
||||
vllm_outputs_per_image):
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", multimodal_models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets,
|
||||
model, size_factors, dtype, max_tokens,
|
||||
num_logprobs) -> None:
|
||||
run_test(
|
||||
tmp_path,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors=size_factors,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
# Ensure that a text-only Qwen model can still be loaded and
|
||||
# used for inference in VLLM without throwing.
|
||||
@pytest.mark.parametrize("model", text_only_models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_text_only_qwen_model_can_be_loaded_and_run(
|
||||
vllm_runner: Type[VllmRunner],
|
||||
example_prompts,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
):
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
example_prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
)
|
||||
@@ -1,9 +1,10 @@
|
||||
# Test the AsyncLLMEngine with multi-step-decoding
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ..utils import (completions_with_server_args, get_client_text_generations,
|
||||
get_client_text_logprob_generations)
|
||||
@@ -33,8 +34,9 @@ DEFAULT_SERVER_ARGS: List[str] = [
|
||||
@pytest.mark.parametrize("eager_mode", [False, True])
|
||||
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
|
||||
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
|
||||
@pytest.mark.parametrize("num_logprobs", [None, 5])
|
||||
@pytest.mark.parametrize("is_async", [False, True])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("is_async", [True])
|
||||
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_step(
|
||||
example_prompts,
|
||||
@@ -46,6 +48,8 @@ async def test_multi_step(
|
||||
num_prompts: int,
|
||||
is_async: bool,
|
||||
num_logprobs: Optional[int],
|
||||
attention_backend: str,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
|
||||
client/server environment.
|
||||
@@ -71,6 +75,8 @@ async def test_multi_step(
|
||||
completions endpoint; `None` -> no logprobs
|
||||
"""
|
||||
|
||||
override_backend_env_variable(monkeypatch, attention_backend)
|
||||
|
||||
prompts = example_prompts
|
||||
if len(prompts) < num_prompts:
|
||||
prompts = prompts * ((num_prompts // len(prompts)) + 1)
|
||||
|
||||
@@ -10,6 +10,8 @@ import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
models_4bit_to_test = [
|
||||
('huggyllama/llama-7b', 'quantize model inflight'),
|
||||
]
|
||||
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
|
||||
model_name,
|
||||
hf_model_kwargs=None):
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, 8)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
# NOTE: run vLLM first, as it requires a clean process
|
||||
# when using distributed inference
|
||||
|
||||
#Run with vLLM runner
|
||||
with vllm_runner(model_name,
|
||||
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, 8)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# Compare the generated strings
|
||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||
hf_str = hf_log["generated_text"]
|
||||
|
||||
@@ -1,12 +1,10 @@
import torch

from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.platforms import current_platform


def is_quant_method_supported(quant_method: str) -> bool:
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

    capability = current_platform.get_device_capability()

@@ -10,6 +10,7 @@ from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import requests
|
||||
from openai.types.completion import Completion
|
||||
from transformers import AutoTokenizer
|
||||
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
from vllm.model_executor.model_loader.loader import get_model_loader
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
|
||||
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
|
||||
get_open_port, is_hip)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
from amdsmi import (amdsmi_get_gpu_vram_usage,
|
||||
@@ -356,12 +358,23 @@ def error_on_warning():
    yield


def get_physical_device_indices(devices):
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        return devices

    visible_indices = [int(x) for x in visible_devices.split(",")]
    index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
    return [index_mapping[i] for i in devices if i in index_mapping]


@_nvml()
def wait_for_gpu_memory_to_clear(devices: List[int],
                                 threshold_bytes: int,
                                 timeout_s: float = 120) -> None:
    # Use nvml instead of pytorch to reduce measurement error from torch cuda
    # context.
    devices = get_physical_device_indices(devices)
    start_time = time.time()
    while True:
        output: Dict[int, str] = {}
@@ -441,6 +454,22 @@ def fork_new_process_for_each_test(
    return wrapper


def multi_gpu_test(*, num_gpus: int):
    """
    Decorate a test to be run only when multiple GPUs are available.
    """
    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
    test_skipif = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        return test_selector(test_skipif(fork_new_process_for_each_test(f)))

    return wrapper
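For illustration only (not part of the diff): a minimal sketch of how a test could opt into the new decorator. The test name, prompt, and model choice below are hypothetical; the decorator skips the test when fewer than num_gpus devices are visible and runs it in a freshly forked process.

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
def test_hypothetical_tensor_parallel(vllm_runner, tp_size: int) -> None:
    # Skipped automatically unless at least 2 GPUs are available.
    with vllm_runner("facebook/opt-125m",
                     tensor_parallel_size=tp_size) as vllm_model:
        vllm_model.generate_greedy(["Hello, my name is"], max_tokens=8)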


async def completions_with_server_args(
    prompts: List[str],
    model_name: str,
@@ -161,16 +161,36 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
|
||||
torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
|
||||
|
||||
|
||||
def advance_step(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor, seq_lens: torch.Tensor,
|
||||
slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor) -> None:
|
||||
def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor,
|
||||
seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor) -> None:
|
||||
"""Advance a step on GPU for existing inputs for a multi-step runner"""
|
||||
return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
|
||||
input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping,
|
||||
block_tables)
|
||||
return torch.ops._C.advance_step_flashattn(num_seqs, num_queries,
|
||||
block_size, input_tokens,
|
||||
sampled_token_ids,
|
||||
input_positions, seq_lens,
|
||||
slot_mapping, block_tables)
|
||||
|
||||
|
||||
def advance_step_flashinfer(num_seqs: int, num_queries: int, block_size: int,
|
||||
input_tokens: torch.Tensor,
|
||||
sampled_token_ids: torch.Tensor,
|
||||
input_positions: torch.Tensor,
|
||||
seq_lens: torch.Tensor, slot_mapping: torch.Tensor,
|
||||
block_tables: torch.Tensor,
|
||||
paged_kv_indices: torch.Tensor,
|
||||
paged_kv_indptr: torch.Tensor,
|
||||
paged_kv_last_page_len: torch.Tensor,
|
||||
block_table_bound: torch.Tensor) -> None:
|
||||
|
||||
return torch.ops._C.advance_step_flashinfer(
|
||||
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
|
||||
input_positions, seq_lens, slot_mapping, block_tables,
|
||||
paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len,
|
||||
block_table_bound)
|
||||
|
||||
|
||||
# quantization ops
|
||||
|
||||
@@ -83,7 +83,9 @@ class AttentionBackend(ABC):
    ) -> None:
        raise NotImplementedError

    def advance_step(self, num_seqs: int, num_queries: int):
    def advance_step(self, model_input: "ModelRunnerInputBase",
                     sampled_token_ids: Optional[torch.Tensor],
                     block_size: int, num_seqs: int, num_queries: int) -> None:
        raise NotImplementedError

@@ -122,6 +122,40 @@ def _(
    return torch.empty_like(decode_query)


@torch.library.custom_op("vllm::reshape_and_cache_flash",
                         mutates_args=["kv_cache"])
def reshape_and_cache_flash(
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
) -> None:
    """Inductor cannot deal with inplace operations on views.
    See https://github.com/pytorch/pytorch/issues/131192
    and https://github.com/pytorch/pytorch/issues/130174
    This is a workaround to hide the view operation from the inductor.
    """
    return torch.ops._C_cache_ops.reshape_and_cache_flash(
        key, value, kv_cache[0], kv_cache[1], slot_mapping, kv_cache_dtype,
        k_scale, v_scale)


@reshape_and_cache_flash.register_fake  # type: ignore
def _(
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: float,
    v_scale: float,
) -> None:
    pass

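For illustration only (not part of the diff): the same hide-the-mutation pattern in standalone form, assuming PyTorch 2.4+ and a made-up op namespace.

import torch

@torch.library.custom_op("demo::scale_inplace", mutates_args=["x"])
def scale_inplace(x: torch.Tensor, factor: float) -> None:
    # The in-place update happens behind the op boundary, so torch.compile
    # does not trace through any view manipulation done inside.
    x.mul_(factor)

@scale_inplace.register_fake
def _(x: torch.Tensor, factor: float) -> None:
    # Fake (meta) implementation: nothing to compute or allocate.
    pass

t = torch.ones(4)
scale_inplace(t, 2.0)  # t is now tensor([2., 2., 2., 2.])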
class FlashAttentionBackend(AttentionBackend):

    @staticmethod
@@ -346,15 +380,15 @@ class FlashAttentionMetadata(AttentionMetadata):
|
||||
self.seq_lens[i] += 1
|
||||
self.max_decode_seq_len = max(self.seq_lens)
|
||||
|
||||
ops.advance_step(num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables)
|
||||
ops.advance_step_flashattn(num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables)
|
||||
|
||||
|
||||
class FlashAttentionMetadataBuilder(
|
||||
@@ -653,11 +687,10 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
# Reshape the input keys and values and store them in the cache.
|
||||
# If kv_cache is not provided, the new key and value tensors are
|
||||
# not cached. This happens during the initial memory profiling run.
|
||||
ops.reshape_and_cache_flash(
|
||||
torch.ops.vllm.reshape_and_cache_flash(
|
||||
key,
|
||||
value,
|
||||
key_cache,
|
||||
value_cache,
|
||||
kv_cache,
|
||||
attn_metadata.slot_mapping.flatten(),
|
||||
self.kv_cache_dtype,
|
||||
k_scale,
|
||||
@@ -669,7 +702,6 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
assert key.shape[0] == num_prefill_tokens + num_decode_tokens
|
||||
assert value.shape[0] == num_prefill_tokens + num_decode_tokens
|
||||
|
||||
output = torch.empty_like(query)
|
||||
# Query for decode. KV is not needed because it is already cached.
|
||||
decode_query = query[num_prefill_tokens:]
|
||||
# QKV for prefill.
|
||||
@@ -680,6 +712,9 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
assert query.shape[0] == num_prefill_tokens
|
||||
assert decode_query.shape[0] == num_decode_tokens
|
||||
|
||||
prefill_output: Optional[torch.Tensor] = None
|
||||
decode_output: Optional[torch.Tensor] = None
|
||||
|
||||
if prefill_meta := attn_metadata.prefill_metadata:
|
||||
# Prompt run.
|
||||
if (kv_cache is None or prefill_meta.block_tables is None
|
||||
@@ -687,7 +722,7 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
# normal attention
|
||||
# When block_tables are not filled, it means q and k are the
|
||||
# prompt, and they have the same length.
|
||||
out = torch.ops.vllm.flash_attn_varlen_func(
|
||||
prefill_output = torch.ops.vllm.flash_attn_varlen_func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
@@ -701,42 +736,44 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
softcap=self.logits_soft_cap,
|
||||
)
|
||||
assert output[:num_prefill_tokens].shape == out.shape
|
||||
output[:num_prefill_tokens] = out
|
||||
else:
|
||||
# prefix-enabled attention
|
||||
assert prefill_meta.seq_lens is not None
|
||||
max_seq_len = max(prefill_meta.seq_lens)
|
||||
output[:
|
||||
num_prefill_tokens] = torch.ops.vllm.flash_attn_varlen_func( # noqa
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
cu_seqlens_q=prefill_meta.query_start_loc,
|
||||
max_seqlen_q=prefill_meta.max_query_len,
|
||||
cu_seqlens_k=prefill_meta.seq_start_loc,
|
||||
max_seqlen_k=max_seq_len,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
block_table=prefill_meta.block_tables,
|
||||
softcap=self.logits_soft_cap,
|
||||
)
|
||||
|
||||
if decode_meta := attn_metadata.decode_metadata:
|
||||
# Decoding run.
|
||||
output[
|
||||
num_prefill_tokens:] = torch.ops.vllm.flash_attn_with_kvcache(
|
||||
decode_query.unsqueeze(1),
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_table=decode_meta.block_tables,
|
||||
cache_seqlens=decode_meta.seq_lens_tensor,
|
||||
prefill_output = torch.ops.vllm.flash_attn_varlen_func( # noqa
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
cu_seqlens_q=prefill_meta.query_start_loc,
|
||||
max_seqlen_q=prefill_meta.max_query_len,
|
||||
cu_seqlens_k=prefill_meta.seq_start_loc,
|
||||
max_seqlen_k=max_seq_len,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
block_table=prefill_meta.block_tables,
|
||||
softcap=self.logits_soft_cap,
|
||||
).squeeze(1)
|
||||
)
|
||||
|
||||
# Reshape the output tensor.
|
||||
if decode_meta := attn_metadata.decode_metadata:
|
||||
# Decoding run.
|
||||
decode_output = torch.ops.vllm.flash_attn_with_kvcache(
|
||||
decode_query.unsqueeze(1),
|
||||
key_cache,
|
||||
value_cache,
|
||||
block_table=decode_meta.block_tables,
|
||||
cache_seqlens=decode_meta.seq_lens_tensor,
|
||||
softmax_scale=self.scale,
|
||||
causal=True,
|
||||
alibi_slopes=self.alibi_slopes,
|
||||
softcap=self.logits_soft_cap,
|
||||
).squeeze(1)
|
||||
|
||||
if prefill_output is None:
|
||||
assert decode_output is not None
|
||||
return decode_output.view(num_decode_tokens, hidden_size)
|
||||
if decode_output is None:
|
||||
assert prefill_output is not None
|
||||
return prefill_output.view(num_prefill_tokens, hidden_size)
|
||||
output = torch.cat([prefill_output, decode_output], dim=0)
|
||||
return output.view(num_tokens, hidden_size)
|
||||
|
||||
@@ -30,7 +30,8 @@ from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
|
||||
make_tensor_with_pad)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.worker.model_runner import ModelInputForGPUBuilder
|
||||
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
|
||||
ModelInputForGPUWithSamplingMetadata)
|
||||
|
||||
|
||||
class FlashInferBackend(AttentionBackend):
|
||||
@@ -268,6 +269,10 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
query_start_loc: Optional[torch.Tensor] = None
|
||||
block_tables: Optional[torch.Tensor] = None
|
||||
|
||||
# used for GPU in-place advance_step
|
||||
seq_lens_tensor: Optional[torch.Tensor] = None
|
||||
block_table_bound: Optional[torch.Tensor] = None
|
||||
|
||||
# An example for paged_kv_indices, paged_kv_indptr:
|
||||
# request 1, page indices [0, 5, 8]
|
||||
# request 2, page indices [1, 6, 7]
|
||||
@@ -318,6 +323,8 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
assert self.block_table_bound is not None
|
||||
assert self.seq_lens_tensor is not None
|
||||
batch_size = self.query_start_loc.shape[0] - 1
|
||||
assert batch_size >= 0
|
||||
# We will use flash attention for profiling to
|
||||
@@ -327,6 +334,8 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
self.block_table_bound = self.block_table_bound.to(self.device)
|
||||
self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.prefill_wrapper.end_forward()
|
||||
self.prefill_wrapper.begin_forward(
|
||||
@@ -335,14 +344,18 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
self.num_qo_heads, self.num_kv_heads, self.head_dim,
|
||||
self.page_size)
|
||||
else:
|
||||
if not self.use_cuda_graph:
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
assert self.paged_kv_indices is not None
|
||||
assert self.paged_kv_indptr is not None
|
||||
assert self.paged_kv_last_page_len is not None
|
||||
self.paged_kv_indices = self.paged_kv_indices.to(self.device)
|
||||
self.paged_kv_indptr = self.paged_kv_indptr.to(self.device)
|
||||
self.paged_kv_last_page_len = self.paged_kv_last_page_len.to(
|
||||
self.device)
|
||||
# handle model warmup path
|
||||
if self.block_table_bound is not None:
|
||||
self.block_table_bound = self.block_table_bound.to(self.device)
|
||||
if self.seq_lens_tensor is not None:
|
||||
self.seq_lens_tensor = self.seq_lens_tensor.to(self.device)
|
||||
|
||||
assert self.decode_wrapper is not None
|
||||
self.decode_wrapper.end_forward()
|
||||
@@ -391,6 +404,48 @@ class FlashInferMetadata(AttentionMetadata):
|
||||
|
||||
return self
|
||||
|
||||
def advance_step(
|
||||
self,
|
||||
model_input: "ModelInputForGPUWithSamplingMetadata",
|
||||
sampled_token_ids: Optional[torch.Tensor],
|
||||
block_size: int,
|
||||
num_seqs: int,
|
||||
num_queries: int,
|
||||
):
|
||||
"""
|
||||
Update metadata in-place to advance one decode step.
|
||||
"""
|
||||
|
||||
assert num_seqs > 0
|
||||
assert num_queries > 0
|
||||
assert model_input.attn_metadata is not None
|
||||
assert sampled_token_ids is not None
|
||||
|
||||
        # When using cudagraph, num_seqs is padded to the next captured
        # batch size, but num_queries tracks the actual number of requests in
        # the batch. For --enforce-eager mode, num_seqs == num_queries
if num_seqs != num_queries:
|
||||
assert num_seqs > num_queries
|
||||
assert self.use_cuda_graph
|
||||
|
||||
model_input.input_tokens[:num_queries] = sampled_token_ids.flatten()
|
||||
|
||||
# Update GPU tensors
|
||||
ops.advance_step_flashinfer(
|
||||
num_seqs=num_seqs,
|
||||
num_queries=num_queries,
|
||||
block_size=block_size,
|
||||
input_tokens=model_input.input_tokens,
|
||||
sampled_token_ids=model_input.input_tokens,
|
||||
input_positions=model_input.input_positions,
|
||||
seq_lens=self.seq_lens_tensor,
|
||||
slot_mapping=self.slot_mapping,
|
||||
block_tables=self.block_tables,
|
||||
paged_kv_indices=self.paged_kv_indices,
|
||||
paged_kv_indptr=self.paged_kv_indptr,
|
||||
paged_kv_last_page_len=self.paged_kv_last_page_len,
|
||||
block_table_bound=self.block_table_bound)
|
||||
|
||||
|
||||
class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
|
||||
@@ -428,7 +483,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
self.paged_kv_indptr: List[int] = [0]
|
||||
# paged_kv_last_page_len is the length of the last page of each request
|
||||
self.paged_kv_last_page_len: List[int] = []
|
||||
|
||||
self.total_blocks = 0
|
||||
self.is_profile_run: bool = False
|
||||
|
||||
def _add_seq_group(
|
||||
@@ -499,6 +554,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
# block_table_bound is 1 with 1 valid block.
|
||||
# If seq_len = 15, block_size = 16,
|
||||
# block_table_bound is 0 + 1 with 1 valid block.
|
||||
self.total_blocks += len(block_table)
|
||||
block_table_bound = seq_len // self.block_size + 1 \
|
||||
if seq_len % self.block_size != 0 \
|
||||
else seq_len // self.block_size
|
||||
@@ -541,9 +597,19 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
# The shape of graph_block_tables is
|
||||
# [max batch size, max context len // block size].
|
||||
input_block_tables = self.runner.graph_block_tables[:batch_size]
|
||||
max_blocks = input_block_tables.shape[1]
|
||||
for i, block_table in enumerate(self.block_tables):
|
||||
if block_table:
|
||||
input_block_tables[i, :len(block_table)] = block_table
|
||||
num_blocks = len(block_table)
|
||||
if num_blocks <= max_blocks:
|
||||
input_block_tables[i, :num_blocks] = block_table
|
||||
else:
|
||||
# It may be possible to have more blocks allocated due
|
||||
# to lookahead slots of multi-step, however, they are
|
||||
# not used anyway, so can be safely ignored.
|
||||
input_block_tables[
|
||||
i, :max_blocks] = block_table[:max_blocks]
|
||||
|
||||
block_tables = torch.from_numpy(input_block_tables).to(
|
||||
device, non_blocking=True)
|
||||
|
||||
@@ -583,6 +649,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
out=query_start_loc[1:])
|
||||
|
||||
if len(self.paged_kv_indptr) > 0:
|
||||
# extend to the maximum number of blocks as returned by the
|
||||
# scheduler
|
||||
self.paged_kv_indices.extend(
|
||||
[0] * (self.total_blocks - len(self.paged_kv_indices)))
|
||||
paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices,
|
||||
device="cpu",
|
||||
dtype=torch.int)
|
||||
@@ -591,10 +661,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
dtype=torch.int)
|
||||
paged_kv_last_page_len_tensor = torch.tensor(
|
||||
self.paged_kv_last_page_len, device="cpu", dtype=torch.int)
|
||||
block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) -
|
||||
1,
|
||||
device="cpu",
|
||||
dtype=torch.int)
|
||||
else:
|
||||
paged_kv_indices_tensor = None
|
||||
paged_kv_indptr_tensor = None
|
||||
paged_kv_last_page_len_tensor = None
|
||||
block_table_bound_tensor = None
|
||||
|
||||
if self.runner.kv_cache_dtype.startswith("fp8"):
|
||||
kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer(
|
||||
@@ -613,6 +688,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
||||
paged_kv_indptr=paged_kv_indptr_tensor,
|
||||
paged_kv_indices=paged_kv_indices_tensor,
|
||||
paged_kv_last_page_len=paged_kv_last_page_len_tensor,
|
||||
block_table_bound=block_table_bound_tensor,
|
||||
seq_lens_tensor=seq_lens_tensor,
|
||||
num_qo_heads=self.runner.model_config.get_num_attention_heads(
|
||||
self.runner.parallel_config),
|
||||
num_kv_heads=self.runner.model_config.get_num_kv_heads(
|
||||
|
||||
@@ -869,6 +869,13 @@ class ParallelConfig:
|
||||
f"distributed executor backend "
|
||||
f"'{self.distributed_executor_backend}'.")
|
||||
|
||||
if current_platform.is_tpu() and self.world_size > 1:
|
||||
if self.distributed_executor_backend is None:
|
||||
self.distributed_executor_backend = "ray"
|
||||
if self.distributed_executor_backend != "ray":
|
||||
raise ValueError(
|
||||
"TPU backend only supports Ray for distributed inference.")
|
||||
|
||||
if self.distributed_executor_backend is None and self.world_size > 1:
|
||||
# We use multiprocessing by default if world_size fits on the
|
||||
# current node and we aren't in a ray placement group.
|
||||
@@ -876,7 +883,7 @@ class ParallelConfig:
|
||||
from vllm.executor import ray_utils
|
||||
backend = "mp"
|
||||
ray_found = ray_utils.ray_is_available()
|
||||
if (torch.cuda.is_available()
|
||||
if (current_platform.is_cuda()
|
||||
and cuda_device_count_stateless() < self.world_size):
|
||||
if not ray_found:
|
||||
raise ValueError("Unable to load Ray which is "
|
||||
|
||||
@@ -843,6 +843,13 @@ class EngineArgs:
|
||||
device_config = DeviceConfig(device=self.device)
|
||||
model_config = self.create_model_config()
|
||||
|
||||
if model_config.is_multimodal_model:
|
||||
if self.enable_prefix_caching:
|
||||
logger.warning(
|
||||
"--enable-prefix-caching is currently not "
|
||||
"supported for multimodal models and has been disabled.")
|
||||
self.enable_prefix_caching = False
|
||||
|
||||
cache_config = CacheConfig(
|
||||
block_size=self.block_size if self.device != "neuron" else
|
||||
self.max_model_len, # neuron needs block_size = max_model_len
|
||||
@@ -874,7 +881,10 @@ class EngineArgs:
|
||||
# If not explicitly set, enable chunked prefill by default for
|
||||
# long context (> 32K) models. This is to avoid OOM errors in the
|
||||
# initial memory profiling phase.
|
||||
if use_long_context:
|
||||
|
||||
# Chunked prefill is currently disabled for multimodal models by
|
||||
# default.
|
||||
if use_long_context and not model_config.is_multimodal_model:
|
||||
is_gpu = device_config.device_type == "cuda"
|
||||
use_sliding_window = (model_config.get_sliding_window()
|
||||
is not None)
|
||||
@@ -1035,7 +1045,6 @@ class EngineArgs:
|
||||
@dataclass
|
||||
class AsyncEngineArgs(EngineArgs):
|
||||
"""Arguments for asynchronous vLLM engine."""
|
||||
engine_use_ray: bool = False
|
||||
disable_log_requests: bool = False
|
||||
|
||||
@staticmethod
|
||||
@@ -1043,16 +1052,6 @@ class AsyncEngineArgs(EngineArgs):
|
||||
async_args_only: bool = False) -> FlexibleArgumentParser:
|
||||
if not async_args_only:
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
parser.add_argument('--engine-use-ray',
|
||||
action='store_true',
|
||||
help='Use Ray to start the LLM engine in a '
|
||||
'separate process as the server process.'
|
||||
'(DEPRECATED. This argument is deprecated '
|
||||
'and will be removed in a future update. '
|
||||
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
|
||||
'use it. See '
|
||||
'https://github.com/vllm-project/vllm/issues/7045.'
|
||||
')')
|
||||
parser.add_argument('--disable-log-requests',
|
||||
action='store_true',
|
||||
help='Disable logging requests.')
|
||||
|
||||
@@ -4,22 +4,18 @@ from functools import partial
|
||||
from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
|
||||
Mapping, Optional, Set, Tuple, Type, Union)
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig,
|
||||
ParallelConfig, SchedulerConfig)
|
||||
from vllm.core.scheduler import SchedulerOutputs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_timeout import asyncio_timeout
|
||||
from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine,
|
||||
PromptComponents, SchedulerOutputState)
|
||||
from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
|
||||
from vllm.engine.metrics_types import StatLoggerBase
|
||||
from vllm.executor.executor_base import ExecutorAsyncBase
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster, ray
|
||||
from vllm.inputs import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
from vllm.executor.gpu_executor import GPUExecutorAsync
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
from vllm.inputs import PromptInputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
@@ -30,7 +26,6 @@ from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import ExecuteModelRequest
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
logger = init_logger(__name__)
|
||||
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
|
||||
@@ -404,139 +399,6 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
"""Stop the remote worker execution loop."""
|
||||
await self.model_executor.stop_remote_worker_execution_loop_async()
|
||||
|
||||
async def _tokenize_prompt_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> List[int]:
|
||||
"""Async version of :meth:`_tokenize_prompt`."""
|
||||
tokenizer = self.get_tokenizer_group(
|
||||
missing_msg="prompts must be None if skip_tokenizer_init is True")
|
||||
|
||||
return await tokenizer.encode_async(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
|
||||
async def _extract_prompt_components_async(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> PromptComponents:
|
||||
"""Async version of :meth:`_extract_prompt_components`."""
|
||||
if isinstance(inputs, str):
|
||||
prompt = inputs
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
multi_modal_data = None
|
||||
elif isinstance(inputs, dict):
|
||||
if "prompt_token_ids" in inputs:
|
||||
prompt = None
|
||||
prompt_token_ids = inputs["prompt_token_ids"]
|
||||
else:
|
||||
# NOTE: This extra assignment is required to pass mypy
|
||||
prompt = parsed_prompt = inputs["prompt"]
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
parsed_prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
multi_modal_data = inputs.get("multi_modal_data")
|
||||
else:
|
||||
assert_never(inputs)
|
||||
|
||||
return prompt, prompt_token_ids, multi_modal_data
|
||||
|
||||
async def _process_encoder_decoder_prompt_async(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
||||
encoder_comps: PromptComponents
|
||||
decoder_comps: DecoderPromptComponents
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
encoder_task = self._extract_prompt_components_async(
|
||||
inputs["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
if (decoder_input := inputs["decoder_prompt"]) is None:
|
||||
encoder_comps = await encoder_task
|
||||
decoder_comps = None, None, None
|
||||
else:
|
||||
decoder_task = self._extract_prompt_components_async(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
encoder_comps, decoder_comps = await asyncio.gather(
|
||||
encoder_task, decoder_task)
|
||||
else:
|
||||
encoder_comps = await self._extract_prompt_components_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
decoder_comps = None, None, None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
|
||||
|
||||
async def _process_decoder_only_prompt_async(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> LLMInputs:
|
||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
||||
prompt_comps = await self._extract_prompt_components_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
async def process_model_inputs_async(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
|
||||
"""Async version of :meth:`process_model_inputs`."""
|
||||
if self.is_encoder_decoder_model():
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
model_inputs = await self._process_encoder_decoder_prompt_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
model_inputs = await self._process_decoder_only_prompt_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
return self.input_processor(model_inputs)
|
||||
|
||||
async def add_request_async(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -554,12 +416,13 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
if arrival_time is None:
|
||||
arrival_time = time.time()
|
||||
|
||||
processed_inputs = await self.process_model_inputs_async(
|
||||
preprocessed_inputs = await self.input_preprocessor.preprocess_async(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
processed_inputs = self.input_processor(preprocessed_inputs)
|
||||
|
||||
self._add_processed_request(
|
||||
request_id=request_id,
|
||||
@@ -590,9 +453,6 @@ class AsyncLLMEngine:
|
||||
worker_use_ray: Whether to use Ray for model workers. Required for
|
||||
distributed execution. Should be the same as
|
||||
`parallel_config.worker_use_ray`.
|
||||
engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
|
||||
async frontend will be executed in a separate process as the
|
||||
model workers.
|
||||
log_requests: Whether to log the requests.
|
||||
start_engine_loop: If True, the background task to run the engine
|
||||
will be automatically started in the generate call.
|
||||
@@ -604,41 +464,23 @@ class AsyncLLMEngine:
|
||||
|
||||
def __init__(self,
|
||||
worker_use_ray: bool,
|
||||
engine_use_ray: bool,
|
||||
*args,
|
||||
log_requests: bool = True,
|
||||
start_engine_loop: bool = True,
|
||||
**kwargs) -> None:
|
||||
self.worker_use_ray = worker_use_ray
|
||||
self.engine_use_ray = engine_use_ray
|
||||
self.log_requests = log_requests
|
||||
self.engine = self._init_engine(*args, **kwargs)
|
||||
self.engine = self._engine_class(*args, **kwargs)
|
||||
|
||||
# This ensures quick processing of request outputs
|
||||
# so the append to asyncio queues is not delayed,
|
||||
# especially for multi-step.
|
||||
#
|
||||
# TODO: Currently, disabled for engine_use_ray, ask
|
||||
# Cody/Will/Woosuk about this case.
|
||||
self.use_process_request_outputs_callback = not self.engine_use_ray
|
||||
self.use_process_request_outputs_callback = True
|
||||
if self.use_process_request_outputs_callback:
|
||||
self.engine.process_request_outputs_callback = \
|
||||
self.process_request_outputs
|
||||
|
||||
if self.engine_use_ray:
|
||||
print_warning_once(
|
||||
"DEPRECATED. `--engine-use-ray` is deprecated and will "
|
||||
"be removed in a future update. "
|
||||
"See https://github.com/vllm-project/vllm/issues/7045.")
|
||||
|
||||
if envs.VLLM_ALLOW_ENGINE_USE_RAY:
|
||||
print_warning_once(
|
||||
"VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
|
||||
else:
|
||||
raise ValueError("`--engine-use-ray` is deprecated. "
|
||||
"Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
|
||||
"force use it")
|
||||
|
||||
self.background_loop: Optional[asyncio.Future] = None
|
||||
# We need to keep a reference to unshielded
|
||||
# task as well to prevent it from being garbage
|
||||
@@ -725,16 +567,11 @@ class AsyncLLMEngine:
|
||||
# Create the engine configs.
|
||||
engine_config = engine_args.create_engine_config()
|
||||
|
||||
if engine_args.engine_use_ray:
|
||||
from vllm.executor import ray_utils
|
||||
ray_utils.assert_ray_available()
|
||||
|
||||
executor_class = cls._get_executor_cls(engine_config)
|
||||
|
||||
# Create the async LLM engine.
|
||||
engine = cls(
|
||||
executor_class.uses_ray,
|
||||
engine_args.engine_use_ray,
|
||||
**engine_config.to_dict(),
|
||||
executor_class=executor_class,
|
||||
log_requests=not engine_args.disable_log_requests,
|
||||
@@ -777,10 +614,6 @@ class AsyncLLMEngine:
|
||||
self,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> AnyTokenizer:
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_tokenizer.remote( # type: ignore
|
||||
lora_request)
|
||||
|
||||
return await (self.engine.get_tokenizer_group().
|
||||
get_lora_tokenizer_async(lora_request))
|
||||
|
||||
@@ -814,26 +647,6 @@ class AsyncLLMEngine:
|
||||
self._background_loop_unshielded = None
|
||||
self.background_loop = None
|
||||
|
||||
def _init_engine(self, *args,
|
||||
**kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
|
||||
if not self.engine_use_ray:
|
||||
engine_class = self._engine_class
|
||||
elif self.worker_use_ray:
|
||||
engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
|
||||
else:
|
||||
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
|
||||
# order of the arguments.
|
||||
cache_config = kwargs["cache_config"]
|
||||
parallel_config = kwargs["parallel_config"]
|
||||
if (parallel_config.tensor_parallel_size == 1
|
||||
and parallel_config.pipeline_parallel_size == 1):
|
||||
num_gpus = cache_config.gpu_memory_utilization
|
||||
else:
|
||||
num_gpus = 1
|
||||
engine_class = ray.remote(num_gpus=num_gpus)(
|
||||
self._engine_class).remote
|
||||
return engine_class(*args, **kwargs)
|
||||
|
||||
async def engine_step(self, virtual_engine: int) -> bool:
|
||||
"""Kick the engine to process the waiting requests.
|
||||
|
||||
@@ -844,13 +657,8 @@ class AsyncLLMEngine:
|
||||
|
||||
for new_request in new_requests:
|
||||
# Add the request into the vLLM engine's waiting queue.
|
||||
# TODO: Maybe add add_request_batch to reduce Ray overhead
|
||||
try:
|
||||
if self.engine_use_ray:
|
||||
await self.engine.add_request.remote( # type: ignore
|
||||
**new_request)
|
||||
else:
|
||||
await self.engine.add_request_async(**new_request)
|
||||
await self.engine.add_request_async(**new_request)
|
||||
except ValueError as e:
|
||||
# TODO: use a vLLM specific error for failed validation
|
||||
self._request_tracker.process_exception(
|
||||
@@ -862,10 +670,7 @@ class AsyncLLMEngine:
|
||||
if aborted_requests:
|
||||
await self._engine_abort(aborted_requests)
|
||||
|
||||
if self.engine_use_ray:
|
||||
request_outputs = await self.engine.step.remote() # type: ignore
|
||||
else:
|
||||
request_outputs = await self.engine.step_async(virtual_engine)
|
||||
request_outputs = await self.engine.step_async(virtual_engine)
|
||||
|
||||
# Put the outputs into the corresponding streams.
|
||||
# If used as a callback, then already invoked inside
|
||||
@@ -891,16 +696,10 @@ class AsyncLLMEngine:
|
||||
return all_finished
|
||||
|
||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
||||
if self.engine_use_ray:
|
||||
await self.engine.abort_request.remote(request_ids) # type: ignore
|
||||
else:
|
||||
self.engine.abort_request(request_ids)
|
||||
self.engine.abort_request(request_ids)
|
||||
|
||||
async def run_engine_loop(self):
|
||||
if self.engine_use_ray:
|
||||
pipeline_parallel_size = 1 # type: ignore
|
||||
else:
|
||||
pipeline_parallel_size = \
|
||||
pipeline_parallel_size = \
|
||||
self.engine.parallel_config.pipeline_parallel_size
|
||||
has_requests_in_progress = [False] * pipeline_parallel_size
|
||||
while True:
|
||||
@@ -912,12 +711,7 @@ class AsyncLLMEngine:
|
||||
# timeout, and unblocks the RPC thread in the workers so that
|
||||
# they can process any other queued control plane messages,
|
||||
# such as add/remove lora adapters.
|
||||
if self.engine_use_ray:
|
||||
await (self.engine.stop_remote_worker_execution_loop.
|
||||
remote() # type: ignore
|
||||
)
|
||||
else:
|
||||
await self.engine.stop_remote_worker_execution_loop_async()
|
||||
await self.engine.stop_remote_worker_execution_loop_async()
|
||||
await self._request_tracker.wait_for_new_requests()
|
||||
logger.debug("Got new requests!")
|
||||
requests_in_progress = [
|
||||
@@ -938,17 +732,9 @@ class AsyncLLMEngine:
|
||||
for task in done:
|
||||
result = task.result()
|
||||
virtual_engine = requests_in_progress.index(task)
|
||||
if self.engine_use_ray:
|
||||
has_unfinished_requests = (
|
||||
await (self.engine.
|
||||
has_unfinished_requests_for_virtual_engine.
|
||||
remote( # type: ignore
|
||||
virtual_engine)))
|
||||
else:
|
||||
has_unfinished_requests = (
|
||||
self.engine.
|
||||
has_unfinished_requests_for_virtual_engine(
|
||||
virtual_engine))
|
||||
has_unfinished_requests = (
|
||||
self.engine.has_unfinished_requests_for_virtual_engine(
|
||||
virtual_engine))
|
||||
if result or has_unfinished_requests:
|
||||
requests_in_progress[virtual_engine] = (
|
||||
asyncio.create_task(
|
||||
@@ -1190,52 +976,29 @@ class AsyncLLMEngine:
|
||||
|
||||
async def get_model_config(self) -> ModelConfig:
|
||||
"""Get the model configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_model_config.remote() # type: ignore
|
||||
else:
|
||||
return self.engine.get_model_config()
|
||||
return self.engine.get_model_config()
|
||||
|
||||
async def get_parallel_config(self) -> ParallelConfig:
|
||||
"""Get the parallel configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_parallel_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_parallel_config()
|
||||
return self.engine.get_parallel_config()
|
||||
|
||||
async def get_decoding_config(self) -> DecodingConfig:
|
||||
"""Get the decoding configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_decoding_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_decoding_config()
|
||||
return self.engine.get_decoding_config()
|
||||
|
||||
async def get_scheduler_config(self) -> SchedulerConfig:
|
||||
"""Get the scheduling configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_scheduler_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_scheduler_config()
|
||||
return self.engine.get_scheduler_config()
|
||||
|
||||
async def get_lora_config(self) -> LoRAConfig:
|
||||
"""Get the lora configuration of the vLLM engine."""
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.get_lora_config.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.get_lora_config()
|
||||
return self.engine.get_lora_config()
|
||||
|
||||
async def do_log_stats(
|
||||
self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs] = None,
|
||||
model_output: Optional[List[SamplerOutput]] = None) -> None:
|
||||
if self.engine_use_ray:
|
||||
await self.engine.do_log_stats.remote( # type: ignore
|
||||
scheduler_outputs, model_output)
|
||||
else:
|
||||
self.engine.do_log_stats()
|
||||
self.engine.do_log_stats()
|
||||
|
||||
async def check_health(self) -> None:
|
||||
"""Raises an error if engine is unhealthy."""
|
||||
@@ -1244,40 +1007,30 @@ class AsyncLLMEngine:
|
||||
if self.is_stopped:
|
||||
raise AsyncEngineDeadError("Background loop is stopped.")
|
||||
|
||||
if self.engine_use_ray:
|
||||
try:
|
||||
await self.engine.check_health.remote() # type: ignore
|
||||
except ray.exceptions.RayActorError as e:
|
||||
raise RuntimeError("Engine is dead.") from e
|
||||
else:
|
||||
await self.engine.check_health_async()
|
||||
await self.engine.check_health_async()
|
||||
logger.debug("Health check took %fs", time.perf_counter() - t)
|
||||
|
||||
async def is_tracing_enabled(self) -> bool:
|
||||
if self.engine_use_ray:
|
||||
return await self.engine.is_tracing_enabled.remote( # type: ignore
|
||||
)
|
||||
else:
|
||||
return self.engine.is_tracing_enabled()
|
||||
return self.engine.is_tracing_enabled()
|
||||
|
||||
def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
|
||||
if self.engine_use_ray:
|
||||
ray.get(
|
||||
self.engine.add_logger.remote( # type: ignore
|
||||
logger_name=logger_name, logger=logger))
|
||||
else:
|
||||
self.engine.add_logger(logger_name=logger_name, logger=logger)
|
||||
self.engine.add_logger(logger_name=logger_name, logger=logger)
|
||||
|
||||
def remove_logger(self, logger_name: str) -> None:
|
||||
if self.engine_use_ray:
|
||||
ray.get(
|
||||
self.engine.remove_logger.remote( # type: ignore
|
||||
logger_name=logger_name))
|
||||
else:
|
||||
self.engine.remove_logger(logger_name=logger_name)
|
||||
self.engine.remove_logger(logger_name=logger_name)
|
||||
|
||||
async def start_profile(self) -> None:
|
||||
self.engine.model_executor._run_workers("start_profile")
|
||||
# using type instead of isinstance to check to avoid capturing
|
||||
# inherited classes
|
||||
if type(self.engine.model_executor) == GPUExecutorAsync:
|
||||
self.engine.model_executor.start_profile()
|
||||
else:
|
||||
self.engine.model_executor._run_workers("start_profile")
|
||||
|
||||
async def stop_profile(self) -> None:
|
||||
self.engine.model_executor._run_workers("stop_profile")
|
||||
# using type instead of isinstance to check to avoid capturing
|
||||
# inherited classes
|
||||
if type(self.engine.model_executor) == GPUExecutorAsync:
|
||||
self.engine.model_executor.stop_profile()
|
||||
else:
|
||||
self.engine.model_executor._run_workers("stop_profile")
|
||||
|
||||
@@ -3,13 +3,13 @@ import time
|
||||
from collections import deque
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List,
|
||||
Mapping, NamedTuple, Optional)
|
||||
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
|
||||
Iterable, List, Mapping, NamedTuple, Optional)
|
||||
from typing import Sequence as GenericSequence
|
||||
from typing import Set, Tuple, Type, Union
|
||||
from typing import Set, Type, Union
|
||||
|
||||
import torch
|
||||
from typing_extensions import TypeVar, assert_never
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
|
||||
@@ -26,20 +26,19 @@ from vllm.engine.output_processor.interfaces import (
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.engine.output_processor.util import create_output_by_sequence_group
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.executor.gpu_executor import GPUExecutor
|
||||
from vllm.executor.ray_utils import initialize_ray_cluster
|
||||
from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
|
||||
InputRegistry, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
|
||||
InputRegistry, LLMInputs, PromptInputs)
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
|
||||
RequestOutputFactory)
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
|
||||
Sequence, SequenceGroup, SequenceGroupMetadata,
|
||||
SequenceStatus)
|
||||
@@ -75,11 +74,6 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
|
||||
_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
|
||||
_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput)
|
||||
|
||||
PromptComponents = Tuple[Optional[str], List[int],
|
||||
Optional[MultiModalDataDict]]
|
||||
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
|
||||
Optional[MultiModalDataDict]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SchedulerOutputState:
|
||||
@@ -225,9 +219,6 @@ class LLMEngine:
|
||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
|
||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
||||
# To improve performance, only final requests outputs may be required.
|
||||
# If this set to true, then no intermediate outputs will be returned.
|
||||
step_return_finished_only: bool = False,
|
||||
) -> None:
|
||||
logger.info(
|
||||
"Initializing an LLM engine (v%s) with config: "
|
||||
@@ -295,7 +286,6 @@ class LLMEngine:
|
||||
self.observability_config = observability_config or ObservabilityConfig(
|
||||
)
|
||||
self.log_stats = log_stats
|
||||
self.step_return_finished_only = step_return_finished_only
|
||||
|
||||
if not self.model_config.skip_tokenizer_init:
|
||||
self.tokenizer = self._init_tokenizer()
|
||||
@@ -317,6 +307,9 @@ class LLMEngine:
|
||||
self.generation_config_fields = _load_generation_config_dict(
|
||||
model_config)
|
||||
|
||||
self.input_preprocessor = InputPreprocessor(model_config,
|
||||
self.tokenizer)
|
||||
|
||||
self.input_registry = input_registry
|
||||
self.input_processor = input_registry.create_input_processor(
|
||||
model_config)
|
||||
@@ -397,7 +390,7 @@ class LLMEngine:
|
||||
|
||||
# Currently used by AsyncLLMEngine to ensure quick append
|
||||
# of request outputs to asyncio queues
|
||||
self.process_request_outputs_callback = None
|
||||
self.process_request_outputs_callback: Optional[Callable] = None
|
||||
|
||||
# Create the scheduler.
|
||||
# NOTE: the cache_config here have been updated with the numbers of
|
||||
@@ -575,19 +568,15 @@ class LLMEngine:
|
||||
if model_executor := getattr(self, "model_executor", None):
|
||||
model_executor.shutdown()
|
||||
|
||||
MISSING_TOKENIZER_GROUP_MSG = ("Unable to get tokenizer because "
|
||||
"skip_tokenizer_init is True")
|
||||
|
||||
def get_tokenizer_group(
|
||||
self,
|
||||
group_type: Type[_G] = BaseTokenizerGroup,
|
||||
*,
|
||||
missing_msg: str = MISSING_TOKENIZER_GROUP_MSG,
|
||||
) -> _G:
|
||||
tokenizer_group = self.tokenizer
|
||||
|
||||
if tokenizer_group is None:
|
||||
raise ValueError(missing_msg)
|
||||
raise ValueError("Unable to get tokenizer because "
|
||||
"skip_tokenizer_init is True")
|
||||
if not isinstance(tokenizer_group, group_type):
|
||||
raise TypeError("Invalid type of tokenizer group. "
|
||||
f"Expected type: {group_type}, but "
|
||||
@@ -619,52 +608,6 @@ class LLMEngine:
|
||||
self.prompt_adapter_config.verify_with_model_config(
|
||||
self.model_config)
|
||||
|
||||
def _get_bos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for BOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id
|
||||
|
||||
def _get_eos_token_id(self,
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
) -> Optional[int]:
|
||||
if self.tokenizer is None:
|
||||
logger.warning("Using None for EOS token id because tokenizer "
|
||||
"is not initialized")
|
||||
return None
|
||||
|
||||
return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
|
||||
|
||||
def _get_decoder_start_token_id(self) -> Optional[int]:
|
||||
'''
|
||||
Obtain the decoder start token id employed by an encoder/decoder
|
||||
model. Returns None for non-encoder/decoder models or if the
|
||||
model config is unavailable.
|
||||
'''
|
||||
|
||||
if not self.is_encoder_decoder_model():
|
||||
logger.warning("Using None for decoder start token id because "
|
||||
"this is not an encoder/decoder model.")
|
||||
return None
|
||||
|
||||
if (self.model_config is None or self.model_config.hf_config is None):
|
||||
logger.warning("Using None for decoder start token id because "
|
||||
"model config is not available.")
|
||||
return None
|
||||
|
||||
dec_start_token_id = getattr(self.model_config.hf_config,
|
||||
'decoder_start_token_id', None)
|
||||
if dec_start_token_id is None:
|
||||
logger.warning("Falling back on <BOS> for decoder start token id "
|
||||
"because decoder start token id is not available.")
|
||||
dec_start_token_id = self._get_bos_token_id()
|
||||
|
||||
return dec_start_token_id
|
||||
|
||||
def _add_processed_request(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -679,7 +622,7 @@ class LLMEngine:
|
||||
# Create the sequences.
|
||||
block_size = self.cache_config.block_size
|
||||
seq_id = next(self.seq_counter)
|
||||
eos_token_id = self._get_eos_token_id(lora_request)
|
||||
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
|
||||
|
||||
seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id,
|
||||
lora_request, prompt_adapter_request)
|
||||
@@ -729,334 +672,6 @@ class LLMEngine:
|
||||
def stop_remote_worker_execution_loop(self) -> None:
|
||||
self.model_executor.stop_remote_worker_execution_loop()
|
||||
|
||||
_LLMInputComponentsType = Tuple[str, List[int]]
|
||||
|
||||
def _prepare_decoder_input_ids_for_generation(
|
||||
self,
|
||||
decoder_input_ids: Optional[List[int]],
|
||||
) -> List[int]:
|
||||
"""
|
||||
Prepares `decoder_input_ids` for generation with encoder-decoder models.
|
||||
|
||||
Based on
|
||||
|
||||
https://github.com/huggingface/transformers/blob/
|
||||
4037a2b5b1278736e566aec12e169100275545ea/
|
||||
src/transformers/generation/utils.py
|
||||
|
||||
specifically GenerationMixin._prepare_decoder_input_ids_for_generation()
|
||||
|
||||
Arguments:
|
||||
|
||||
* decoder_input_ids: input token ids to preprocess
|
||||
|
||||
Returns:
|
||||
|
||||
* Processed token list
|
||||
"""
|
||||
|
||||
decoder_start_token_id = self._get_decoder_start_token_id()
|
||||
assert decoder_start_token_id is not None
|
||||
|
||||
if decoder_input_ids is None:
|
||||
# no decoder prompt input ->
|
||||
# use decoder_start_token_id as decoder_input_ids
|
||||
decoder_input_ids = self._get_default_enc_dec_decoder_prompt()
|
||||
|
||||
if (len(decoder_input_ids) == 0
|
||||
or decoder_input_ids[0] != decoder_start_token_id):
|
||||
decoder_input_ids = [decoder_start_token_id] + decoder_input_ids
|
||||
|
||||
return decoder_input_ids
|
||||
|
||||
def _tokenize_prompt(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> List[int]:
|
||||
'''
|
||||
Wrapper around application of the model's tokenizer.
|
||||
|
||||
Arguments:
|
||||
|
||||
* prompt
|
||||
* request_id
|
||||
* lora_request
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt token ids
|
||||
'''
|
||||
|
||||
tokenizer = self.get_tokenizer_group(
|
||||
missing_msg="prompts must be None if skip_tokenizer_init is True")
|
||||
|
||||
return tokenizer.encode(request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request)
|
||||
|
||||
def _extract_prompt_components(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
) -> PromptComponents:
|
||||
'''
|
||||
Extract the components of any single encoder or decoder input prompt.
|
||||
|
||||
Arguments:
|
||||
|
||||
* request_id
|
||||
* inputs: single encoder or decoder input prompt
|
||||
* lora_request: this is only valid for decoder prompts
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt
|
||||
* prompt_token_ids
|
||||
* multi_modal_data
|
||||
'''
|
||||
|
||||
if isinstance(inputs, str):
|
||||
prompt = inputs
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
multi_modal_data = None
|
||||
elif isinstance(inputs, dict):
|
||||
if "prompt_token_ids" in inputs:
|
||||
prompt = None
|
||||
prompt_token_ids = inputs["prompt_token_ids"]
|
||||
else:
|
||||
# NOTE: This extra assignment is required to pass mypy
|
||||
prompt = parsed_prompt = inputs["prompt"]
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
parsed_prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
multi_modal_data = inputs.get("multi_modal_data")
|
||||
else:
|
||||
assert_never(inputs)
|
||||
|
||||
return prompt, prompt_token_ids, multi_modal_data
|
||||
|
||||
def _apply_prompt_adapter(
|
||||
self,
|
||||
prompt_token_ids: List[int],
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> List[int]:
|
||||
if prompt_adapter_request:
|
||||
prompt_token_ids = (
|
||||
[0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
|
||||
+ prompt_token_ids)
|
||||
|
||||
return prompt_token_ids
|
||||
|
||||
def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
|
||||
'''
|
||||
Specifically for encoder/decoder models:
|
||||
generate a default decoder prompt for when
|
||||
the user specifies only the encoder prompt.
|
||||
|
||||
Encoder/decoder models utilize the decoder
|
||||
prompt in different ways; as new models are
|
||||
added, it is intended that this function
|
||||
will be extended to produce differing
|
||||
default decoder prompts, depending on the
|
||||
model variety.
|
||||
|
||||
Absent a special case, the default behavior
|
||||
of this method is to mirror the behavior of
|
||||
the HuggingFace (HF) GenerationMixin for a None
|
||||
decoder prompt, which is to employ a logit processor
|
||||
setting to force the first decoded token to be <BOS>.
|
||||
Here, this behavior is approximated by having the
|
||||
"default" decoder prompt be <BOS>.
|
||||
|
||||
However, it is possible that in the future
|
||||
other models may have different or more
|
||||
complex logic for the default decoder prompt.
|
||||
This motivates having a special helper method
|
||||
for default decoder prompts.
|
||||
|
||||
Returns:
|
||||
|
||||
* prompt_token_ids
|
||||
'''
|
||||
|
||||
bos_token_id = self._get_bos_token_id()
|
||||
assert bos_token_id is not None
|
||||
return [bos_token_id]
|
||||
|
||||
def _build_enc_dec_llm_inputs(
|
||||
self,
|
||||
encoder_comps: PromptComponents,
|
||||
decoder_comps: DecoderPromptComponents,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
|
||||
decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps
|
||||
|
||||
if encoder_mm_data is not None or decoder_mm_data is not None:
|
||||
raise ValueError("Multi-modal encoder-decoder models are "
|
||||
"not supported yet")
|
||||
|
||||
decoder_prompt_ids = (
|
||||
self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
|
||||
|
||||
return EncoderDecoderLLMInputs(
|
||||
prompt_token_ids=decoder_prompt_ids,
|
||||
prompt=decoder_prompt,
|
||||
encoder_prompt_token_ids=encoder_prompt_ids,
|
||||
encoder_prompt=encoder_prompt,
|
||||
)
|
||||
|
||||
def _process_encoder_decoder_prompt(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderLLMInputs:
|
||||
'''
|
||||
For encoder/decoder models only:
|
||||
Process an input prompt into an
|
||||
:class:`EncoderDecoderLLMInputs` instance.
|
||||
|
||||
There are two types of input prompts:
|
||||
singleton prompts which carry only the
|
||||
encoder prompt, and explicit encoder/decoder
|
||||
prompts which carry both the encoder and the
|
||||
decoder prompts as member variables.
|
||||
|
||||
This function handles the following scenarios:
|
||||
* Singleton encoder prompt: extract encoder prompt
|
||||
token ids & infer default decoder prompt token ids
|
||||
* Explicit encoder/decoder prompt: extract encoder
|
||||
and decoder prompt token ids
|
||||
|
||||
Note that for Explicit encoder/decoder prompts,
|
||||
each sub-prompt (encoder or decoder prompt) can
|
||||
have any possible singleton type; thus this
|
||||
method relies on helper functions to obtain
|
||||
token ids for the sub-prompts.
|
||||
|
||||
Arguments:
|
||||
|
||||
* inputs: an input prompt
|
||||
* request_id
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`EncoderDecoderLLMInputs` instance
|
||||
'''
|
||||
|
||||
encoder_comps: PromptComponents
|
||||
decoder_comps: DecoderPromptComponents
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
encoder_comps = self._extract_prompt_components(
|
||||
inputs["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
if (decoder_input := inputs["decoder_prompt"]) is None:
|
||||
decoder_comps = None, None, None
|
||||
else:
|
||||
decoder_comps = self._extract_prompt_components(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
encoder_comps = self._extract_prompt_components(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
decoder_comps = None, None, None
|
||||
|
||||
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)
|
||||
|
||||
def _build_decoder_only_llm_inputs(
|
||||
self,
|
||||
prompt_comps: PromptComponents,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest],
|
||||
) -> LLMInputs:
|
||||
prompt, prompt_token_ids, multi_modal_data = prompt_comps
|
||||
|
||||
prompt_token_ids = self._apply_prompt_adapter(
|
||||
prompt_token_ids, prompt_adapter_request=prompt_adapter_request)
|
||||
|
||||
return LLMInputs(prompt_token_ids=prompt_token_ids,
|
||||
prompt=prompt,
|
||||
multi_modal_data=multi_modal_data)
|
||||
|
||||
def _process_decoder_only_prompt(
|
||||
self,
|
||||
inputs: SingletonPromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> LLMInputs:
|
||||
'''
|
||||
For decoder-only models:
|
||||
Process an input prompt into an :class:`LLMInputs` instance.
|
||||
|
||||
Arguments:
|
||||
|
||||
* inputs: input prompt
|
||||
* request_id
|
||||
* lora_request
|
||||
* prompt_adapter_request
|
||||
|
||||
Returns:
|
||||
|
||||
* :class:`LLMInputs` instance
|
||||
'''
|
||||
|
||||
prompt_comps = self._extract_prompt_components(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
return self._build_decoder_only_llm_inputs(
|
||||
prompt_comps,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
def process_model_inputs(
|
||||
self,
|
||||
inputs: PromptInputs,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
|
||||
|
||||
if self.is_encoder_decoder_model():
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
model_inputs = self._process_encoder_decoder_prompt(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
)
|
||||
else:
|
||||
if is_explicit_encoder_decoder_prompt(inputs):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
"to decoder-only models")
|
||||
|
||||
# Decoder-only operation
|
||||
model_inputs = self._process_decoder_only_prompt(
|
||||
inputs,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
return self.input_processor(model_inputs)
|
||||
|
||||
def add_request(
|
||||
self,
|
||||
request_id: str,
|
||||
@@ -1115,12 +730,13 @@ class LLMEngine:
        if arrival_time is None:
            arrival_time = time.time()

        processed_inputs = self.process_model_inputs(
        preprocessed_inputs = self.input_preprocessor.preprocess(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
        processed_inputs = self.input_processor(preprocessed_inputs)

        self._add_processed_request(
            request_id=request_id,
@@ -1378,7 +994,8 @@ class LLMEngine:
            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            request_output = RequestOutputFactory.create(seq_group)
            ctx.request_outputs.append(request_output)
            if request_output:
                ctx.request_outputs.append(request_output)

            # When we process a single request, we skip it for the next time,
            # and invoke the request output callback (if there was final output)
@@ -1415,14 +1032,19 @@ class LLMEngine:

            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            if (seq_group.is_finished()
                    if self.step_return_finished_only else True):
                request_output = RequestOutputFactory.create(seq_group)
            request_output = RequestOutputFactory.create(seq_group)
            if request_output:
                ctx.request_outputs.append(request_output)

        for seq_group in scheduler_outputs.ignored_seq_groups:
            params = seq_group.sampling_params
            if params is not None and params.output_kind == (
                    RequestOutputKind.DELTA) and not seq_group.is_finished():
                continue

            request_output = RequestOutputFactory.create(seq_group)
            ctx.request_outputs.append(request_output)
            if request_output:
                ctx.request_outputs.append(request_output)

        # Immediately process request outputs here (if callback is given)
        if (ctx.request_outputs
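A word on the repeated `if request_output:` guards above: with the new RequestOutputKind plumbing, RequestOutputFactory.create() can legitimately produce nothing for a given step (for example FINAL_ONLY requests that have not finished yet), so outputs are only appended when truthy. A compressed sketch of that gating; the loop scaffolding and seq-group objects are invented stand-ins, and the None-return behavior of the factory is inferred from the added checks rather than stated in this diff.

from vllm.outputs import RequestOutputFactory
from vllm.sampling_params import RequestOutputKind

def collect_step_outputs(seq_groups, request_outputs):
    for seq_group in seq_groups:
        params = seq_group.sampling_params
        # DELTA requests with no finished sequences have nothing new to report.
        if (params is not None
                and params.output_kind == RequestOutputKind.DELTA
                and not seq_group.is_finished()):
            continue
        # The factory is assumed to return None when there is nothing to emit
        # (e.g. FINAL_ONLY before the group finishes), hence the truthiness check.
        request_output = RequestOutputFactory.create(seq_group)
        if request_output:
            request_outputs.append(request_output)
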
@@ -1435,7 +1057,8 @@ class LLMEngine:
|
||||
# LLMEngine/AsyncLLMEngine directly
|
||||
if is_async:
|
||||
# Log stats.
|
||||
self.do_log_stats(scheduler_outputs, outputs, finished_before)
|
||||
self.do_log_stats(scheduler_outputs, outputs, finished_before,
|
||||
skip)
|
||||
|
||||
# Tracing
|
||||
self.do_tracing(scheduler_outputs)
|
||||
@@ -1742,18 +1365,20 @@ class LLMEngine:
|
||||
def do_log_stats(self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs] = None,
|
||||
model_output: Optional[List[SamplerOutput]] = None,
|
||||
finished_before: Optional[List[int]] = None) -> None:
|
||||
finished_before: Optional[List[int]] = None,
|
||||
skip: Optional[List[int]] = None) -> None:
|
||||
"""Forced log when no requests active."""
|
||||
if self.log_stats:
|
||||
stats = self._get_stats(scheduler_outputs, model_output,
|
||||
finished_before)
|
||||
finished_before, skip)
|
||||
for logger in self.stat_loggers.values():
|
||||
logger.log(stats)
|
||||
|
||||
def _get_stats(self,
|
||||
scheduler_outputs: Optional[SchedulerOutputs],
|
||||
model_output: Optional[List[SamplerOutput]] = None,
|
||||
finished_before: Optional[List[int]] = None) -> Stats:
|
||||
finished_before: Optional[List[int]] = None,
|
||||
skip: Optional[List[int]] = None) -> Stats:
|
||||
"""Get Stats to be Logged to Prometheus.
|
||||
|
||||
Args:
|
||||
@@ -1761,6 +1386,10 @@ class LLMEngine:
|
||||
the scheduled batch,
|
||||
model_output: Optional, used to emit speculative decoding metrics
|
||||
which are created by the workers.
|
||||
finished_before: Optional, indices of sequences that were finished
|
||||
before. These sequences will be ignored.
|
||||
skip: Optional, indices of sequences that were preempted. These
|
||||
sequences will be ignored.
|
||||
"""
|
||||
now = time.time()
|
||||
|
||||
@@ -1835,6 +1464,11 @@ class LLMEngine:
|
||||
actual_num_batched_tokens -= 1
|
||||
continue
|
||||
|
||||
# Currently, skip == preempted sequences, so we need to skip
|
||||
# their log stats
|
||||
if skip and idx in skip:
|
||||
continue
|
||||
|
||||
group_was_prefill = idx < scheduler_outputs.num_prefill_groups
|
||||
seq_group = scheduled_seq_group.seq_group
|
||||
|
||||
@@ -1964,10 +1598,20 @@ class LLMEngine:
        self.model_executor.check_health()

    def start_profile(self) -> None:
        self.model_executor.start_profile()
        # using type instead of isinstance to check to avoid capturing
        # inherited classes (MultiprocessingGPUExecutor)
        if type(self.model_executor) == GPUExecutor:
            self.model_executor.start_profile()
        else:
            self.model_executor._run_workers("start_profile")

    def stop_profile(self) -> None:
        self.model_executor.stop_profile()
        # using type instead of isinstance to check to avoid capturing
        # inherited classes (MultiprocessingGPUExecutor)
        if type(self.model_executor) == GPUExecutor:
            self.model_executor.stop_profile()
        else:
            self.model_executor._run_workers("stop_profile")

    def is_tracing_enabled(self) -> bool:
        return self.tracer is not None
@@ -2041,7 +1685,7 @@ class LLMEngine:
                metrics.model_execute_time)

    def is_encoder_decoder_model(self):
        return self.model_config.is_encoder_decoder_model
        return self.input_preprocessor.is_encoder_decoder_model()

    def is_embedding_model(self):
        return self.model_config.is_embedding_model

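A short aside on why the profile hooks compare type() rather than using isinstance(): subclasses such as MultiprocessingGPUExecutor run their workers in separate processes and must broadcast the call instead of profiling only the driver. The toy classes below are simplified stand-ins for the executors named in the diff, not the real implementations.

class GPUExecutor:  # stand-in for vllm.executor.gpu_executor.GPUExecutor
    def start_profile(self):
        print("profiling the in-process worker")

class MultiprocessingGPUExecutor(GPUExecutor):  # stand-in subclass
    def _run_workers(self, method: str):
        print(f"broadcasting {method!r} to all worker processes")

def start_profile(model_executor) -> None:
    # type() deliberately excludes subclasses: an isinstance() check would send
    # the multiprocessing executor down the single-process branch.
    if type(model_executor) == GPUExecutor:
        model_executor.start_profile()
    else:
        model_executor._run_workers("start_profile")

start_profile(GPUExecutor())                 # profiling the in-process worker
start_profile(MultiprocessingGPUExecutor())  # broadcasting 'start_profile' ...
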
@@ -19,7 +19,7 @@ from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
|
||||
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
get_cached_tokenizer)
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
@@ -642,14 +642,12 @@ class LLM:
            raise ValueError("The lengths of prompts and lora_request "
                             "must be the same.")

        if isinstance(params, list):
            params = [
                self._add_guided_processor(param, guided_options)
                if isinstance(param, SamplingParams) else param
                for param in params
            ]
        elif isinstance(params, SamplingParams):
            params = self._add_guided_processor(params, guided_options)
        for sp in params if isinstance(params, list) else (params, ):
            if isinstance(sp, SamplingParams):
                self._add_guided_processor(sp, guided_options)

                # We only care about the final output
                sp.output_kind = RequestOutputKind.FINAL_ONLY

        # Add requests to the engine.
        for i, request_inputs in enumerate(inputs):
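This loop replaces the earlier step_return_finished_only toggle (removed a few hunks below): offline LLM.generate() now simply marks every SamplingParams as FINAL_ONLY so intermediate outputs are never materialized. The helper below is a minimal standalone restatement of that normalization, not code from the diff.

from vllm.sampling_params import RequestOutputKind, SamplingParams

def mark_final_only(params):
    # Accept a single SamplingParams or a list of them, as LLM.generate() does.
    for sp in params if isinstance(params, list) else (params, ):
        if isinstance(sp, SamplingParams):
            # Offline generation only needs each request's finished output.
            sp.output_kind = RequestOutputKind.FINAL_ONLY

sampling = SamplingParams(max_tokens=16)
mark_final_only(sampling)
assert sampling.output_kind == RequestOutputKind.FINAL_ONLY
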
@@ -709,9 +707,6 @@ class LLM:
|
||||
f"output: {0:.2f} toks/s"),
|
||||
)
|
||||
|
||||
# In the loop below, only finished outputs are used
|
||||
self.llm_engine.step_return_finished_only = True
|
||||
|
||||
# Run the engine.
|
||||
outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
|
||||
total_in_toks = 0
|
||||
@@ -724,6 +719,7 @@ class LLM:
|
||||
if use_tqdm:
|
||||
if isinstance(output, RequestOutput):
|
||||
# Calculate tokens only for RequestOutput
|
||||
assert output.prompt_token_ids is not None
|
||||
total_in_toks += len(output.prompt_token_ids)
|
||||
in_spd = total_in_toks / pbar.format_dict["elapsed"]
|
||||
total_out_toks += sum(
|
||||
@@ -735,9 +731,6 @@ class LLM:
|
||||
f"output: {out_spd:.2f} toks/s")
|
||||
pbar.update(1)
|
||||
|
||||
# Restore original behavior
|
||||
self.llm_engine.step_return_finished_only = False
|
||||
|
||||
if use_tqdm:
|
||||
pbar.close()
|
||||
# Sort the outputs by request ID.
|
||||
|
||||
@@ -12,7 +12,8 @@ from typing_extensions import Annotated, Required, TypedDict
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.logits_processors import get_logits_processors
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import LogitsProcessor, SamplingParams
from vllm.sampling_params import (LogitsProcessor, RequestOutputKind,
                                  SamplingParams)
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import random_uuid
@@ -316,6 +317,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
            length_penalty=self.length_penalty,
            logits_processors=logits_processors,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
        )

    @model_validator(mode="before")
@@ -559,6 +562,8 @@ class CompletionRequest(OpenAIBaseModel):
            length_penalty=self.length_penalty,
            logits_processors=logits_processors,
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            output_kind=RequestOutputKind.DELTA if self.stream \
                else RequestOutputKind.FINAL_ONLY,
        )

    @model_validator(mode="before")

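So each OpenAI-facing request now picks its output kind from its own stream flag when building SamplingParams. A reduced sketch of that mapping; the function is a stand-in for the request models' to_sampling_params(), not the actual pydantic code.

from vllm.sampling_params import RequestOutputKind, SamplingParams

def to_sampling_params_sketch(stream: bool, max_tokens: int) -> SamplingParams:
    # Streaming clients want incremental deltas; non-streaming clients only
    # need the final, fully assembled output.
    output_kind = (RequestOutputKind.DELTA
                   if stream else RequestOutputKind.FINAL_ONLY)
    return SamplingParams(max_tokens=max_tokens, output_kind=output_kind)

print(to_sampling_params_sketch(stream=True, max_tokens=32).output_kind)
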
@@ -195,7 +195,6 @@ async def main(args):
|
||||
engine = AsyncLLMEngine.from_engine_args(
|
||||
engine_args, usage_context=UsageContext.OPENAI_BATCH_RUNNER)
|
||||
|
||||
# When using single vLLM without engine_use_ray
|
||||
model_config = await engine.get_model_config()
|
||||
|
||||
if args.disable_log_requests:
|
||||
|
||||
@@ -246,8 +246,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
|
||||
if request.add_generation_prompt:
|
||||
return self.response_role
|
||||
else:
|
||||
return request.messages[-1]["role"]
|
||||
return request.messages[-1]["role"]
|
||||
|
||||
async def chat_completion_stream_generator(
|
||||
self,
|
||||
@@ -264,15 +263,37 @@ class OpenAIServingChat(OpenAIServing):
|
||||
|
||||
# Send response for each token for each request.n (index)
|
||||
num_choices = 1 if request.n is None else request.n
|
||||
previous_texts = [""] * num_choices
|
||||
previous_num_tokens = [0] * num_choices
|
||||
finish_reason_sent = [False] * num_choices
|
||||
|
||||
num_prompt_tokens = 0
|
||||
|
||||
tool_parser: Optional[ToolParser] = self.tool_parser(
|
||||
tokenizer) if self.tool_parser else None
|
||||
|
||||
if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
|
||||
tool_choice_function_name = request.tool_choice.function.name
|
||||
else:
|
||||
tool_choice_function_name = None
|
||||
|
||||
# Determine whether tools are in use with "auto" tool choice
|
||||
tool_choice_auto = (
|
||||
not tool_choice_function_name
|
||||
and self._should_stream_with_auto_tool_parsing(request))
|
||||
|
||||
all_previous_token_ids: Optional[List[List[int]]]
|
||||
if tool_choice_auto:
|
||||
# These are only required in "auto" tool choice case
|
||||
previous_texts = [""] * num_choices
|
||||
all_previous_token_ids = [[]] * num_choices
|
||||
else:
|
||||
previous_texts, all_previous_token_ids = None, None
|
||||
|
||||
try:
|
||||
async for res in result_generator:
|
||||
if res.prompt_token_ids is not None:
|
||||
num_prompt_tokens = len(res.prompt_token_ids)
|
||||
|
||||
# We need to do it here, because if there are exceptions in
|
||||
# the result_generator, it needs to be sent as the FIRST
|
||||
# response (by the try...catch).
|
||||
@@ -305,10 +326,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
and request.stream_options.include_usage):
|
||||
# if continuous usage stats are requested, add it
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
usage = UsageInfo(prompt_tokens=prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=prompt_tokens)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=num_prompt_tokens)
|
||||
chunk.usage = usage
|
||||
# otherwise don't
|
||||
else:
|
||||
@@ -344,12 +365,10 @@ class OpenAIServingChat(OpenAIServing):
|
||||
request.stream_options.include_usage):
|
||||
if (request.stream_options.
|
||||
continuous_usage_stats):
|
||||
prompt_tokens = len(
|
||||
res.prompt_token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=0,
|
||||
total_tokens=prompt_tokens)
|
||||
total_tokens=num_prompt_tokens)
|
||||
chunk.usage = usage
|
||||
else:
|
||||
chunk.usage = None
|
||||
@@ -360,65 +379,66 @@ class OpenAIServingChat(OpenAIServing):
|
||||
first_iteration = False
|
||||
|
||||
for output in res.outputs:
|
||||
|
||||
i = output.index
|
||||
|
||||
if finish_reason_sent[i]:
|
||||
continue
|
||||
|
||||
delta_token_ids = output.token_ids[previous_num_tokens[i]:]
|
||||
out_logprobs = output.logprobs[
|
||||
previous_num_tokens[i]:] if output.logprobs else None
|
||||
|
||||
if request.logprobs and request.top_logprobs is not None:
|
||||
assert out_logprobs is not None, (
|
||||
assert output.logprobs is not None, (
|
||||
"Did not output logprobs")
|
||||
logprobs = self._create_chat_logprobs(
|
||||
token_ids=delta_token_ids,
|
||||
top_logprobs=out_logprobs,
|
||||
token_ids=output.token_ids,
|
||||
top_logprobs=output.logprobs,
|
||||
tokenizer=tokenizer,
|
||||
num_output_top_logprobs=request.top_logprobs,
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
|
||||
delta_text = output.text[len(previous_texts[i]):]
|
||||
delta_message: Optional[DeltaMessage] = None
|
||||
delta_text = output.text
|
||||
delta_message: Optional[DeltaMessage]
|
||||
|
||||
# handle streaming deltas for tools with named tool_choice
|
||||
if (request.tool_choice and type(request.tool_choice) is
|
||||
ChatCompletionNamedToolChoiceParam):
|
||||
if tool_choice_function_name:
|
||||
delta_message = DeltaMessage(tool_calls=[
|
||||
DeltaToolCall(function=DeltaFunctionCall(
|
||||
name=request.tool_choice.function.name,
|
||||
name=tool_choice_function_name,
|
||||
arguments=delta_text),
|
||||
index=i)
|
||||
])
|
||||
|
||||
# handle streaming deltas for tools with "auto" tool choice
|
||||
elif (self._should_stream_with_auto_tool_parsing(request)
|
||||
and tool_parser):
|
||||
elif tool_choice_auto:
|
||||
assert previous_texts is not None
|
||||
assert all_previous_token_ids is not None
|
||||
assert tool_parser is not None
|
||||
#TODO optimize manipulation of these lists
|
||||
previous_text = previous_texts[i]
|
||||
previous_token_ids = all_previous_token_ids[i]
|
||||
current_text = previous_text + delta_text
|
||||
current_token_ids = previous_token_ids + list(
|
||||
output.token_ids)
|
||||
|
||||
delta_message = (
|
||||
tool_parser.extract_tool_calls_streaming(
|
||||
previous_text=previous_texts[i],
|
||||
current_text=output.text,
|
||||
previous_text=previous_text,
|
||||
current_text=current_text,
|
||||
delta_text=delta_text,
|
||||
previous_token_ids= \
|
||||
output.token_ids[
|
||||
:-1 * len(delta_token_ids)
|
||||
],
|
||||
current_token_ids=output.token_ids,
|
||||
delta_token_ids=delta_token_ids
|
||||
)
|
||||
)
|
||||
previous_token_ids=previous_token_ids,
|
||||
current_token_ids=current_token_ids,
|
||||
delta_token_ids=output.token_ids))
|
||||
|
||||
                    # update the previous values for the next iteration
                    previous_texts[i] = current_text
                    all_previous_token_ids[i] = current_token_ids

                # handle streaming just a content delta
                else:
                    delta_message = DeltaMessage(content=delta_text)

                # set the previous values for the next iteration
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                previous_num_tokens[i] += len(output.token_ids)

                # if the message delta is None (e.g. because it was a
                # "control token" for tool calls or the parser otherwise
|
||||
# handle usage stats if requested & if continuous
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats):
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
completion_tokens = len(output.token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens +
|
||||
total_tokens=num_prompt_tokens +
|
||||
completion_tokens,
|
||||
)
|
||||
chunk.usage = usage
|
||||
@@ -482,7 +501,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
tool_parser.prev_tool_call_arr[index].get(
|
||||
"arguments", {}))
|
||||
|
||||
# get what we've streamed so for for arguments
|
||||
# get what we've streamed so far for arguments
|
||||
# for the current tool
|
||||
actual_call = tool_parser.streamed_args_for_tool[
|
||||
index]
|
||||
@@ -500,7 +519,6 @@ class OpenAIServingChat(OpenAIServing):
|
||||
])
|
||||
|
||||
# Send the finish response for each request.n only once
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
choice_data = ChatCompletionResponseStreamChoice(
|
||||
index=i,
|
||||
delta=delta_message,
|
||||
@@ -518,13 +536,12 @@ class OpenAIServingChat(OpenAIServing):
|
||||
model=model_name)
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats):
|
||||
prompt_tokens = len(res.prompt_token_ids)
|
||||
if request.stream_options.continuous_usage_stats:
|
||||
completion_tokens = len(output.token_ids)
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=prompt_tokens +
|
||||
total_tokens=num_prompt_tokens +
|
||||
completion_tokens,
|
||||
)
|
||||
chunk.usage = usage
|
||||
@@ -538,10 +555,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
# is sent, send the usage
|
||||
if (request.stream_options
|
||||
and request.stream_options.include_usage):
|
||||
completion_tokens = previous_num_tokens[i]
|
||||
final_usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=previous_num_tokens[i],
|
||||
total_tokens=prompt_tokens + previous_num_tokens[i],
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=num_prompt_tokens + completion_tokens,
|
||||
)
|
||||
|
||||
final_usage_chunk = ChatCompletionStreamResponse(
|
||||
@@ -607,7 +625,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
|
||||
# if auto tools are not enabled, and a named tool choice using
|
||||
# outlines is not being used
|
||||
if not (self.enable_auto_tools
|
||||
if (not self.enable_auto_tools
|
||||
or not self.tool_parser) and not isinstance(
|
||||
request.tool_choice,
|
||||
ChatCompletionNamedToolChoiceParam):
|
||||
@@ -680,6 +698,7 @@ class OpenAIServingChat(OpenAIServing):
|
||||
or "")
|
||||
choice.message.content = full_message
|
||||
|
||||
assert final_res.prompt_token_ids is not None
|
||||
num_prompt_tokens = len(final_res.prompt_token_ids)
|
||||
num_generated_tokens = sum(
|
||||
len(output.token_ids) for output in final_res.outputs)
|
||||
@@ -789,9 +808,9 @@ class OpenAIServingChat(OpenAIServing):
|
||||
return bool(
|
||||
# if there is a delta message that includes tool calls which
|
||||
# include a function that has arguments
|
||||
self.enable_auto_tools and self.tool_parser and delta_message
|
||||
output.finish_reason is not None
|
||||
and self.enable_auto_tools and self.tool_parser and delta_message
|
||||
and delta_message.tool_calls and delta_message.tool_calls[0]
|
||||
and delta_message.tool_calls[0].function
|
||||
and delta_message.tool_calls[0].function.arguments is not None
|
||||
and output.finish_reason is not None
|
||||
)
|
||||
|
||||
@@ -223,9 +223,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
tokenizer: AnyTokenizer,
|
||||
) -> AsyncGenerator[str, None]:
|
||||
num_choices = 1 if request.n is None else request.n
|
||||
previous_texts = [""] * num_choices * num_prompts
|
||||
previous_text_lens = [0] * num_choices * num_prompts
|
||||
previous_num_tokens = [0] * num_choices * num_prompts
|
||||
has_echoed = [False] * num_choices * num_prompts
|
||||
num_prompt_tokens = [0] * num_prompts
|
||||
|
||||
try:
|
||||
async for prompt_idx, res in result_generator:
|
||||
@@ -233,6 +234,10 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
prompt_logprobs = res.prompt_logprobs
|
||||
prompt_text = res.prompt
|
||||
|
||||
# Prompt details are excluded from later streamed outputs
|
||||
if res.prompt_token_ids is not None:
|
||||
num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
|
||||
|
||||
delta_token_ids: GenericSequence[int]
|
||||
out_logprobs: Optional[GenericSequence[Optional[Dict[
|
||||
int, Logprob]]]]
|
||||
@@ -244,6 +249,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
assert request.max_tokens is not None
|
||||
if request.echo and request.max_tokens == 0:
|
||||
assert prompt_token_ids is not None
|
||||
assert prompt_text is not None
|
||||
# only return the prompt
|
||||
delta_text = prompt_text
|
||||
@@ -252,6 +258,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
has_echoed[i] = True
|
||||
elif (request.echo and request.max_tokens > 0
|
||||
and not has_echoed[i]):
|
||||
assert prompt_token_ids is not None
|
||||
assert prompt_text is not None
|
||||
assert prompt_logprobs is not None
|
||||
# echo the prompt and first token
|
||||
@@ -266,11 +273,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
has_echoed[i] = True
|
||||
else:
|
||||
# return just the delta
|
||||
delta_text = output.text[len(previous_texts[i]):]
|
||||
delta_token_ids = output.token_ids[
|
||||
previous_num_tokens[i]:]
|
||||
out_logprobs = output.logprobs[previous_num_tokens[
|
||||
i]:] if output.logprobs else None
|
||||
delta_text = output.text
|
||||
delta_token_ids = output.token_ids
|
||||
out_logprobs = output.logprobs
|
||||
|
||||
if request.logprobs is not None:
|
||||
assert out_logprobs is not None, (
|
||||
@@ -280,13 +285,13 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
top_logprobs=out_logprobs,
|
||||
num_output_top_logprobs=request.logprobs,
|
||||
tokenizer=tokenizer,
|
||||
initial_text_offset=len(previous_texts[i]),
|
||||
initial_text_offset=previous_text_lens[i],
|
||||
)
|
||||
else:
|
||||
logprobs = None
|
||||
|
||||
previous_texts[i] = output.text
|
||||
previous_num_tokens[i] = len(output.token_ids)
|
||||
previous_text_lens[i] += len(output.text)
|
||||
previous_num_tokens[i] += len(output.token_ids)
|
||||
finish_reason = output.finish_reason
|
||||
stop_reason = output.stop_reason
|
||||
|
||||
@@ -307,8 +312,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
and request.stream_options.include_usage):
|
||||
if (request.stream_options.continuous_usage_stats
|
||||
or output.finish_reason is not None):
|
||||
prompt_tokens = len(prompt_token_ids)
|
||||
completion_tokens = len(output.token_ids)
|
||||
prompt_tokens = num_prompt_tokens[prompt_idx]
|
||||
completion_tokens = previous_num_tokens[i]
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
@@ -356,6 +361,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
|
||||
for final_res in final_res_batch:
|
||||
prompt_token_ids = final_res.prompt_token_ids
|
||||
assert prompt_token_ids is not None
|
||||
prompt_logprobs = final_res.prompt_logprobs
|
||||
prompt_text = final_res.prompt
|
||||
|
||||
@@ -411,9 +417,9 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
)
|
||||
choices.append(choice_data)
|
||||
|
||||
num_generated_tokens += len(output.token_ids)
|
||||
|
||||
num_prompt_tokens += len(prompt_token_ids)
|
||||
num_generated_tokens += sum(
|
||||
len(output.token_ids) for output in final_res.outputs)
|
||||
|
||||
usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
|
||||
@@ -33,7 +33,6 @@ class Hermes2ProToolParser(ToolParser):
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: List[Dict] = []
        self.current_tool_id: int = -1
        self.current_tool_name_sent = False
        self.streamed_args_for_tool: List[str] = [
        ] # map what has been streamed for each tool so far to a list


@@ -58,7 +58,6 @@ if TYPE_CHECKING:
|
||||
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
|
||||
VLLM_TEST_FORCE_FP8_MARLIN: bool = False
|
||||
VLLM_RPC_GET_DATA_TIMEOUT_MS: int = 5000
|
||||
VLLM_ALLOW_ENGINE_USE_RAY: bool = False
|
||||
VLLM_PLUGINS: Optional[List[str]] = None
|
||||
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
|
||||
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
|
||||
@@ -391,14 +390,6 @@ environment_variables: Dict[str, Callable[[], Any]] = {
|
||||
"VLLM_RPC_GET_DATA_TIMEOUT_MS":
|
||||
lambda: int(os.getenv("VLLM_RPC_GET_DATA_TIMEOUT_MS", "5000")),
|
||||
|
||||
# If set, allow running the engine as a separate ray actor,
|
||||
# which is a deprecated feature soon to be removed.
|
||||
# See https://github.com/vllm-project/vllm/issues/7045
|
||||
"VLLM_ALLOW_ENGINE_USE_RAY":
|
||||
lambda:
|
||||
(os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
|
||||
("1", "true")),
|
||||
|
||||
# a list of plugin names to load, separated by commas.
|
||||
# if this is not set, it means all plugins will be loaded
|
||||
# if this is set to an empty string, no plugins will be loaded
|
||||
|
||||
@@ -5,7 +5,8 @@ from typing_extensions import TypeIs
from vllm.utils import is_list_of

from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt,
                   LLMInputs, PromptInputs)
                   LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt,
                   TokensPrompt)


class ParsedText(TypedDict):
@@ -60,8 +61,38 @@ def parse_and_batch_prompt(
            for elem in prompt
        ]

    raise ValueError("prompt must be a string, array of strings, "
                     "array of tokens, or array of token arrays")
    raise TypeError("prompt must be a string, array of strings, "
                    "array of tokens, or array of token arrays")


class ParsedStrPrompt(TypedDict):
    type: Literal["str"]
    content: str


class ParsedTextPrompt(TypedDict):
    type: Literal["text"]
    content: TextPrompt


class ParsedTokensPrompt(TypedDict):
    type: Literal["tokens"]
    content: TokensPrompt


def parse_singleton_prompt(
    inputs: SingletonPromptInputs,
) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]:
    if isinstance(inputs, str):
        return ParsedStrPrompt(type="str", content=inputs)
    elif isinstance(inputs, dict):
        if "prompt_token_ids" in inputs:
            return ParsedTokensPrompt(type="tokens",
                                      content=inputs)  # type: ignore
        elif "prompt" in inputs:
            return ParsedTextPrompt(type="text", content=inputs)

    raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt")


def is_explicit_encoder_decoder_prompt(
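For reference, a quick usage sketch of the new parse_singleton_prompt helper added above; since the Parsed* types are TypedDicts, the returned values are plain dicts, and the printed output comments are illustrative.

from vllm.inputs.parse import parse_singleton_prompt

print(parse_singleton_prompt("Hello"))
# {'type': 'str', 'content': 'Hello'}

print(parse_singleton_prompt({"prompt": "Hello"}))
# {'type': 'text', 'content': {'prompt': 'Hello'}}

print(parse_singleton_prompt({"prompt_token_ids": [1, 2, 3]}))
# {'type': 'tokens', 'content': {'prompt_token_ids': [1, 2, 3]}}
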
vllm/inputs/preprocess.py (new file, +536 lines)
@@ -0,0 +1,536 @@
|
||||
import asyncio
|
||||
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||
|
||||
from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs,
|
||||
SingletonPromptInputs)
|
||||
from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
PromptComponents = Tuple[Optional[str], List[int],
|
||||
Optional["MultiModalDataDict"]]
|
||||
DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]],
|
||||
Optional["MultiModalDataDict"]]
|
||||
|
||||
|
||||
class InputPreprocessor:

    def __init__(
        self,
        model_config: ModelConfig,
        tokenizer: Optional[BaseTokenizerGroup],
    ) -> None:
        super().__init__()

        self.model_config = model_config
        self.tokenizer = tokenizer

    def get_tokenizer_group(self) -> BaseTokenizerGroup:
        if self.tokenizer is None:
            raise ValueError("You cannot pass text prompts when "
                             "`skip_tokenizer_init` is True")

        return self.tokenizer

    def get_bos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for BOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).bos_token_id

    def get_eos_token_id(self,
                         lora_request: Optional[LoRARequest] = None
                         ) -> Optional[int]:
        if self.tokenizer is None:
            logger.warning("Using None for EOS token id because tokenizer "
                           "is not initialized")
            return None

        return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id

    def get_decoder_start_token_id(self) -> Optional[int]:
        '''
        Obtain the decoder start token id employed by an encoder/decoder
        model. Returns None for non-encoder/decoder models or if the
        model config is unavailable.
        '''

        if not self.is_encoder_decoder_model():
            logger.warning("Using None for decoder start token id because "
                           "this is not an encoder/decoder model.")
            return None

        if (self.model_config is None or self.model_config.hf_config is None):
            logger.warning("Using None for decoder start token id because "
                           "model config is not available.")
            return None

        dec_start_token_id = getattr(self.model_config.hf_config,
                                     'decoder_start_token_id', None)
        if dec_start_token_id is None:
            logger.warning("Falling back on <BOS> for decoder start token id "
                           "because decoder start token id is not available.")
            dec_start_token_id = self.get_bos_token_id()

        return dec_start_token_id

    def _get_default_enc_dec_decoder_prompt(self) -> List[int]:
        '''
        Specifically for encoder/decoder models:
        generate a default decoder prompt for when
        the user specifies only the encoder prompt.

        Encoder/decoder models utilize the decoder
        prompt in different ways; as new models are
        added, it is intended that this function
        will be extended to produce differing
        default decoder prompts, depending on the
        model variety.

        Absent a special case, the default behavior
        of this method is to mirror the behavior of
        the HuggingFace (HF) GenerationMixin for a None
        decoder prompt, which is to employ a logit processor
        setting to force the first decoded token to be <BOS>.
        Here, this behavior is approximated by having the
        "default" decoder prompt be <BOS>.

        However, it is possible that in the future
        other models may have different or more
        complex logic for the default decoder prompt.
        This motivates having a special helper method
        for default decoder prompts.

        Returns:

        * prompt_token_ids
        '''

        bos_token_id = self.get_bos_token_id()
        assert bos_token_id is not None
        return [bos_token_id]

    def _prepare_decoder_input_ids_for_generation(
        self,
        decoder_input_ids: Optional[List[int]],
    ) -> List[int]:
        """
        Prepares `decoder_input_ids` for generation with encoder-decoder models.

        Based on

        https://github.com/huggingface/transformers/blob/
        4037a2b5b1278736e566aec12e169100275545ea/
        src/transformers/generation/utils.py

        specifically GenerationMixin._prepare_decoder_input_ids_for_generation()

        Arguments:

        * decoder_input_ids: input token ids to preprocess

        Returns:

        * Processed token list
        """

        decoder_start_token_id = self.get_decoder_start_token_id()
        assert decoder_start_token_id is not None

        if decoder_input_ids is None:
            # no decoder prompt input ->
            # use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = self._get_default_enc_dec_decoder_prompt()

        if (len(decoder_input_ids) == 0
                or decoder_input_ids[0] != decoder_start_token_id):
            decoder_input_ids = [decoder_start_token_id] + decoder_input_ids

        return decoder_input_ids

    def _apply_prompt_adapter(
        self,
        prompt_token_ids: List[int],
        prompt_adapter_request: Optional[PromptAdapterRequest],
    ) -> List[int]:
        if prompt_adapter_request:
            prompt_token_ids = (
                [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens
                + prompt_token_ids)

        return prompt_token_ids

    def _tokenize_prompt(
        self,
        prompt: str,
        request_id: str,
        lora_request: Optional[LoRARequest],
    ) -> List[int]:
        """
        Apply the model's tokenizer to a text prompt, returning the
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()

        return tokenizer.encode(request_id=request_id,
                                prompt=prompt,
                                lora_request=lora_request)

    async def _tokenize_prompt_async(
        self,
        prompt: str,
        request_id: str,
        lora_request: Optional[LoRARequest],
    ) -> List[int]:
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()

        return await tokenizer.encode_async(request_id=request_id,
                                            prompt=prompt,
                                            lora_request=lora_request)

    def _extract_prompt_components(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
    ) -> PromptComponents:
        '''
        Extract the components of any single encoder or decoder input prompt.

        Arguments:

        * request_id
        * inputs: single encoder or decoder input prompt
        * lora_request: this is only valid for decoder prompts

        Returns:

        * prompt
        * prompt_token_ids
        * multi_modal_data
        '''

        parsed = parse_singleton_prompt(inputs)

        if parsed["type"] == "str":
            prompt = parsed["content"]
            prompt_token_ids = self._tokenize_prompt(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = None
        elif parsed["type"] == "tokens":
            prompt = None
            prompt_token_ids = parsed["content"]["prompt_token_ids"]
            multi_modal_data = parsed["content"].get("multi_modal_data")
        elif parsed["type"] == "text":
            prompt = parsed["content"]["prompt"]
            prompt_token_ids = self._tokenize_prompt(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = parsed["content"].get("multi_modal_data")
        else:
            assert_never(parsed)

        return prompt, prompt_token_ids, multi_modal_data

    async def _extract_prompt_components_async(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
    ) -> PromptComponents:
        """Async version of :meth:`_extract_prompt_components`."""
        parsed = parse_singleton_prompt(inputs)

        if parsed["type"] == "str":
            prompt = parsed["content"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = None
        elif parsed["type"] == "tokens":
            prompt = None
            prompt_token_ids = parsed["content"]["prompt_token_ids"]
            multi_modal_data = parsed["content"].get("multi_modal_data")
        elif parsed["type"] == "text":
            prompt = parsed["content"]["prompt"]
            prompt_token_ids = await self._tokenize_prompt_async(
                prompt,
                request_id=request_id,
                lora_request=lora_request,
            )
            multi_modal_data = parsed["content"].get("multi_modal_data")
        else:
            assert_never(parsed)

        return prompt, prompt_token_ids, multi_modal_data

    def _build_enc_dec_llm_inputs(
        self,
        encoder_comps: PromptComponents,
        decoder_comps: DecoderPromptComponents,
    ) -> EncoderDecoderLLMInputs:
        encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps
        decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps

        if encoder_mm_data is not None or decoder_mm_data is not None:
            raise ValueError("Multi-modal encoder-decoder models are "
                             "not supported yet")

        decoder_prompt_ids = (
            self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))

        return EncoderDecoderLLMInputs(
            prompt_token_ids=decoder_prompt_ids,
            prompt=decoder_prompt,
            encoder_prompt_token_ids=encoder_prompt_ids,
            encoder_prompt=encoder_prompt,
        )

    def _process_encoder_decoder_prompt(
        self,
        inputs: PromptInputs,
        request_id: str,
    ) -> EncoderDecoderLLMInputs:
        '''
        For encoder/decoder models only:
        Process an input prompt into an
        :class:`EncoderDecoderLLMInputs` instance.

        There are two types of input prompts:
        singleton prompts which carry only the
        encoder prompt, and explicit encoder/decoder
        prompts which carry both the encoder and the
        decoder prompts as member variables.

        This function handles the following scenarios:
        * Singleton encoder prompt: extract encoder prompt
          token ids & infer default decoder prompt token ids
        * Explicit encoder/decoder prompt: extract encoder
          and decoder prompt token ids

        Note that for explicit encoder/decoder prompts,
        each sub-prompt (encoder or decoder prompt) can
        have any possible singleton type; thus this
        method relies on helper functions to obtain
        token ids for the sub-prompts.

        Arguments:

        * inputs: an input prompt
        * request_id

        Returns:

        * :class:`EncoderDecoderLLMInputs` instance
        '''

        encoder_comps: PromptComponents
        decoder_comps: DecoderPromptComponents

        if is_explicit_encoder_decoder_prompt(inputs):
            encoder_comps = self._extract_prompt_components(
                inputs["encoder_prompt"],
                request_id=request_id,
            )

            if (decoder_input := inputs["decoder_prompt"]) is None:
                decoder_comps = None, None, None
            else:
                decoder_comps = self._extract_prompt_components(
                    decoder_input,
                    request_id=request_id,
                )
        else:
            encoder_comps = self._extract_prompt_components(
                inputs,
                request_id=request_id,
            )

            decoder_comps = None, None, None

        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)

    async def _process_encoder_decoder_prompt_async(
        self,
        inputs: PromptInputs,
        request_id: str,
    ) -> EncoderDecoderLLMInputs:
        """Async version of :meth:`_process_encoder_decoder_prompt`."""
        encoder_comps: PromptComponents
        decoder_comps: DecoderPromptComponents

        if is_explicit_encoder_decoder_prompt(inputs):
            encoder_task = self._extract_prompt_components_async(
                inputs["encoder_prompt"],
                request_id=request_id,
            )

            if (decoder_input := inputs["decoder_prompt"]) is None:
                encoder_comps = await encoder_task
                decoder_comps = None, None, None
            else:
                decoder_task = self._extract_prompt_components_async(
                    decoder_input,
                    request_id=request_id,
                )

                encoder_comps, decoder_comps = await asyncio.gather(
                    encoder_task, decoder_task)
        else:
            encoder_comps = await self._extract_prompt_components_async(
                inputs,
                request_id=request_id,
            )

            decoder_comps = None, None, None

        return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps)

    def _build_decoder_only_llm_inputs(
        self,
        prompt_comps: PromptComponents,
        prompt_adapter_request: Optional[PromptAdapterRequest],
    ) -> LLMInputs:
        prompt, prompt_token_ids, multi_modal_data = prompt_comps

        prompt_token_ids = self._apply_prompt_adapter(
            prompt_token_ids, prompt_adapter_request=prompt_adapter_request)

        return LLMInputs(prompt_token_ids=prompt_token_ids,
                         prompt=prompt,
                         multi_modal_data=multi_modal_data)

    def _process_decoder_only_prompt(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> LLMInputs:
        '''
        For decoder-only models:
        Process an input prompt into an :class:`LLMInputs` instance.

        Arguments:

        * inputs: input prompt
        * request_id
        * lora_request
        * prompt_adapter_request

        Returns:

        * :class:`LLMInputs` instance
        '''

        prompt_comps = self._extract_prompt_components(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def _process_decoder_only_prompt_async(
        self,
        inputs: SingletonPromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> LLMInputs:
        """Async version of :meth:`_process_decoder_only_prompt`."""
        prompt_comps = await self._extract_prompt_components_async(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
        )

        return self._build_decoder_only_llm_inputs(
            prompt_comps,
            prompt_adapter_request=prompt_adapter_request,
        )

    def preprocess(
        self,
        inputs: PromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
        """Preprocess the input prompt."""
        if self.is_encoder_decoder_model():
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return self._process_encoder_decoder_prompt(
                inputs,
                request_id=request_id,
            )

        if is_explicit_encoder_decoder_prompt(inputs):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return self._process_decoder_only_prompt(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

    async def preprocess_async(
        self,
        inputs: PromptInputs,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> Union[LLMInputs, EncoderDecoderLLMInputs]:
        """Async version of :meth:`preprocess`."""
        if self.is_encoder_decoder_model():
            # Encoder-decoder model requires special mapping of
            # input prompts to encoder & decoder
            return await self._process_encoder_decoder_prompt_async(
                inputs,
                request_id=request_id,
            )

        if is_explicit_encoder_decoder_prompt(inputs):
            raise ValueError("Cannot pass encoder-decoder prompt "
                             "to decoder-only models")

        # Decoder-only operation
        return await self._process_decoder_only_prompt_async(
            inputs,
            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )

    def is_encoder_decoder_model(self):
        return self.model_config.is_encoder_decoder_model
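
The decoder-prompt handling in the new file boils down to one rule: if no decoder prompt is given, default it to <BOS>, and always ensure the sequence starts with the decoder start token. A standalone sketch of that rule; prepare_decoder_ids is a hypothetical name used here for illustration, not the vLLM API:

from typing import List, Optional

# Hypothetical standalone restatement of
# InputPreprocessor._prepare_decoder_input_ids_for_generation (shown above).
def prepare_decoder_ids(decoder_ids: Optional[List[int]],
                        decoder_start_id: int,
                        bos_id: int) -> List[int]:
    if decoder_ids is None:
        decoder_ids = [bos_id]  # default decoder prompt is <BOS>
    if not decoder_ids or decoder_ids[0] != decoder_start_id:
        decoder_ids = [decoder_start_id] + decoder_ids
    return decoder_ids

assert prepare_decoder_ids(None, decoder_start_id=2, bos_id=0) == [2, 0]
assert prepare_decoder_ids([2, 5, 7], decoder_start_id=2, bos_id=0) == [2, 5, 7]
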
@@ -410,6 +410,7 @@ def fused_topk(

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights, topk_ids


@@ -443,7 +444,8 @@ def grouped_topk(hidden_states: torch.Tensor,

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids

    return topk_weights, topk_ids.to(torch.int32)


def get_config_dtype_str(dtype: torch.dtype,
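
The renormalize branch above rescales each token's selected expert weights so they sum to 1. A minimal sketch of the same operation on a dummy tensor (illustrative only):

import torch

topk_weights = torch.tensor([[0.5, 0.3],
                             [0.2, 0.2]])
# Divide each row by its sum so the routing weights form a distribution.
topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
print(topk_weights)  # tensor([[0.6250, 0.3750], [0.5000, 0.5000]]); each row sums to 1
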
@@ -990,7 +990,7 @@ def get_rope(
            base, is_neox_style, dtype, short_factor, long_factor,
            **extra_kwargs)
    elif scaling_type == "mrope":
        return MRotaryEmbedding(
        rotary_emb = MRotaryEmbedding(
            head_size,
            rotary_dim,
            max_position,
@@ -90,12 +90,12 @@ _MULTIMODAL_MODELS = {
    "PaliGemmaForConditionalGeneration": ("paligemma",
                                          "PaliGemmaForConditionalGeneration"),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "PixtralForConditionalGeneration": ("pixtral",
                                        "PixtralForConditionalGeneration"),
    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                        "Qwen2VLForConditionalGeneration"),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
}
_CONDITIONAL_GENERATION_MODELS = {
    "BartModel": ("bart", "BartForConditionalGeneration"),
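
Each registry entry above maps an architecture name to a (module_name, class_name) pair used to locate the implementation; the change adds PixtralForConditionalGeneration and Qwen2VLForConditionalGeneration while keeping the entries in alphabetical order. A minimal sketch of how such an entry could be resolved; the package prefix below is an assumption for illustration, not the actual loader code:

_MULTIMODAL_MODELS = {
    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                        "Qwen2VLForConditionalGeneration"),
}

module_name, class_name = _MULTIMODAL_MODELS["Qwen2VLForConditionalGeneration"]
# Assumed import location; the real lookup lives inside vLLM's model registry.
print(f"vllm.model_executor.models.{module_name}.{class_name}")
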
Some files were not shown because too many files have changed in this diff.