[Model][Spec Decode] Nemotron-H MTP and Mamba Speculative Decoding Support (#33726)

Signed-off-by: Shahar Mor <smor@nvidia.com> Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Co-authored-by: Shahar Mor <smor@nvidia.com> Co-authored-by: Roi Koren <roik@nvidia.com> Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-02-24 12:49:56 -05:00
parent a9e15e040d
commit f5972a872f
19 changed files with 799 additions and 157 deletions
--- a/tests/v1/attention/test_mamba_update_block_table.py
+++ b/tests/v1/attention/test_mamba_update_block_table.py
@@ -41,6 +41,9 @@ def _make_vllm_config(block_size, max_model_len, max_num_seqs):
            cudagraph_mode=CUDAGraphMode.FULL,
            max_cudagraph_capture_size=None,
        ),
+        speculative_config=None,
+        num_speculative_tokens=0,
+        parallel_config=SimpleNamespace(decode_context_parallel_size=1),
        scheduler_config=SimpleNamespace(max_num_seqs=max_num_seqs),
        model_config=SimpleNamespace(max_model_len=max_model_len),
    )
@@ -92,7 +95,10 @@ def test_update_block_table_copies_block_idx_to_persistent_buffers():
        has_initial_states_p=None,
        query_start_loc_p=None,
        num_computed_tokens_p=None,
-        state_indices_tensor=builder_a.state_indices_tensor[:num_reqs],
+        state_indices_tensor_p=None,
+        query_start_loc_d=None,
+        num_accepted_tokens=None,
+        state_indices_tensor_d=builder_a.state_indices_tensor_d[:num_reqs],
        block_idx_last_scheduled_token=(
            builder_a.block_idx_last_scheduled_token[:num_reqs]
        ),