diff --git a/tests/entrypoints/openai/responses/test_harmony.py b/tests/entrypoints/openai/responses/test_harmony.py
index 9d97800a9..36d51812e 100644
--- a/tests/entrypoints/openai/responses/test_harmony.py
+++ b/tests/entrypoints/openai/responses/test_harmony.py
@@ -906,6 +906,10 @@ async def test_function_calling_no_code_interpreter_events(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(
+    reason="This test is flaky in CI, needs investigation and "
+    "potential fixes in the code interpreter MCP implementation."
+)
 async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, server):
     tools = [{"type": "mcp", "server_label": "code_interpreter"}]
     input_text = (