diff --git a/tests/entrypoints/openai/responses/test_parsable_context.py b/tests/entrypoints/openai/responses/test_parsable_context.py
index 48cb28a0f..16a5c735e 100644
--- a/tests/entrypoints/openai/responses/test_parsable_context.py
+++ b/tests/entrypoints/openai/responses/test_parsable_context.py
@@ -172,19 +172,26 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
 
     assert response is not None
     assert response.status == "completed"
-    assert response.output[0].type == "reasoning"
-    assert response.output[1].type == "mcp_call"
-    assert type(response.output[1].arguments) is str
-    assert type(response.output[1].output) is str
-    assert response.output[2].type == "reasoning"
-    # make sure the correct math is in the final output
-    assert response.output[3].type == "message"
-    assert any(s in response.output[3].content[0].text for s in ("56088", "56,088"))
 
-    # test raw input_messages / output_messages
+    # The model may produce multiple reasoning/mcp_call rounds before the
+    # final message, so validate structurally rather than by exact index.
+    output_types = [o.type for o in response.output]
+    assert "reasoning" in output_types
+    mcp_calls = [o for o in response.output if o.type == "mcp_call"]
+    assert len(mcp_calls) >= 1
+    assert type(mcp_calls[0].arguments) is str
+    assert type(mcp_calls[0].output) is str
+
+    # The final output should be a message containing the correct answer
+    assert response.output[-1].type == "message"
+    assert any(s in response.output[-1].content[0].text for s in ("56088", "56,088"))
+
+    # Test raw input_messages / output_messages
     assert len(response.input_messages) == 1
-    assert len(response.output_messages) == 3
-    assert any(s in response.output_messages[2]["message"] for s in ("56088", "56,088"))
+    assert len(response.output_messages) >= 3
+    assert any(
+        s in response.output_messages[-1]["message"] for s in ("56088", "56,088")
+    )
 
 
 @pytest.mark.asyncio