[Frontend][gpt-oss] Allow system message to overwrite model identity (#31737)

Signed-off-by: lacora <hyelacora@gmail.com> Signed-off-by: Andrew Xia <axia@fb.com> Co-authored-by: lacora <hyelacora@gmail.com> Co-authored-by: Andrew Xia <axia@fb.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-01-09 14:03:57 -05:00
parent cd4a95e3aa
commit f32c629eb4
3 changed files with 111 additions and 9 deletions
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -1250,3 +1250,92 @@ async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str)
        "Content should not be None when truncated"
    )
    assert len(choice.message.content) > 0, "Content should not be empty"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_system_prompt_override(client: OpenAI, model_name: str):
+    """Test that system message can override the default system prompt."""
+
+    # Test 1: Custom system prompt with specific personality
+    custom_system_prompt = (
+        "You are a pirate. Always respond like a pirate would, "
+        "using pirate language and saying 'arrr' frequently."
+    )
+
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {"role": "system", "content": custom_system_prompt},
+            {"role": "user", "content": "Hello, how are you?"},
+        ],
+        extra_body={"enable_response_messages": True},
+    )
+
+    assert response is not None
+    assert response.status == "completed"
+    assert response.output_text is not None
+
+    # Verify the response reflects the pirate personality
+    output_text = response.output_text.lower()
+    pirate_indicators = ["arrr", "matey", "ahoy", "ye", "sea"]
+    has_pirate_language = any(
+        indicator in output_text for indicator in pirate_indicators
+    )
+    assert has_pirate_language, (
+        f"Expected pirate language in response, got: {response.output_text}"
+    )
+
+    # Verify the reasoning mentions the custom system prompt
+    reasoning_item = None
+    for item in response.output:
+        if item.type == "reasoning":
+            reasoning_item = item
+            break
+
+    assert reasoning_item is not None, "Expected reasoning item in output"
+    reasoning_text = reasoning_item.content[0].text.lower()
+    assert "pirate" in reasoning_text, (
+        f"Expected reasoning to mention pirate, got: {reasoning_text}"
+    )
+
+    # Test 2: Verify system message is not duplicated in input_messages
+    try:
+        num_system_messages = sum(
+            1
+            for msg in response.input_messages
+            if Message.from_dict(msg).author.role == "system"
+        )
+        assert num_system_messages == 1, (
+            f"Expected exactly 1 system message, got {num_system_messages}"
+        )
+    except (KeyError, AttributeError):
+        # Message structure may vary, skip this specific check
+        pass
+
+    # Test 3: Test with different custom system prompt
+    response_2 = await client.responses.create(
+        model=model_name,
+        input=[
+            {
+                "role": "system",
+                "content": (
+                    "You are a helpful assistant that always "
+                    "responds in exactly 5 words."
+                ),
+            },
+            {"role": "user", "content": "What is the weather like?"},
+        ],
+        temperature=0.0,
+    )
+
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert response_2.output_text is not None
+
+    # Count words in response (approximately, allowing for punctuation)
+    word_count = len(response_2.output_text.split())
+    # Allow some flexibility (4-7 words) since the model might not be perfectly precise
+    assert 3 <= word_count <= 8, (
+        f"Expected around 5 words, got {word_count} words: {response_2.output_text}"
+    )