2 changes: 2 additions & 0 deletions configs/default_config.yaml
@@ -41,11 +41,13 @@ llm:
api_key: null # API key (defaults to OPENAI_API_KEY env variable)
# or use ${VAR} syntax to specify which environment variable to read from:
# api_key: ${GEMINI_API_KEY} # Reads API key from $GEMINI_API_KEY
api_type: "auto" # API type: "auto, "responses" or "chat_completions"

# Generation parameters
temperature: 0.7 # Temperature for generation (higher = more creative)
top_p: 0.95 # Top-p sampling parameter
max_tokens: 4096 # Maximum tokens to generate
reasoning_effort: "medium" # Reasoning effort: "low", "medium", "high", "xhigh"

# Request parameters
timeout: 60 # Timeout for API requests in seconds
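For readers pointing this config at a non-OpenAI endpoint, a quick sketch of what an explicit override could look like, parsed with PyYAML (already a project dependency). The `api_base` placement and the OpenRouter URL are illustrative assumptions, not taken from this diff:

```python
import yaml  # PyYAML, listed in the project dependencies

# Hypothetical llm-section override forcing Chat Completions for a
# non-OpenAI endpoint; key names mirror the config shown above.
snippet = """
llm:
  api_base: https://openrouter.ai/api/v1
  api_key: ${OPENROUTER_API_KEY}
  api_type: chat_completions
"""
cfg = yaml.safe_load(snippet)["llm"]
assert cfg["api_type"] == "chat_completions"
```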
12 changes: 12 additions & 0 deletions openevolve/config.py
@@ -78,6 +78,13 @@ class LLMModelConfig:
# Reasoning parameters
reasoning_effort: Optional[str] = None

# API type selection: "auto" (default), "responses", or "chat_completions"
# - "auto": Use Responses API for OpenAI endpoints, Chat Completions for others
# - "responses": Force use of OpenAI Responses API
# - "chat_completions": Force use of Chat Completions API
# None means inherit from parent config (defaults to "auto")
api_type: Optional[str] = None

def __post_init__(self):
"""Post-initialization to resolve ${VAR} env var references in api_key"""
self.api_key = _resolve_env_var(self.api_key)
@@ -116,6 +123,9 @@ class LLMConfig(LLMModelConfig):
# Reasoning parameters (inherited from LLMModelConfig but can be overridden)
reasoning_effort: Optional[str] = None

# API type for LLM level (defaults to "auto" for auto-detection)
api_type: str = "auto"

def __post_init__(self):
"""Post-initialization to set up model configurations"""
super().__post_init__() # Resolve ${VAR} in api_key at LLMConfig level
@@ -170,6 +180,7 @@ def __post_init__(self):
"retry_delay": self.retry_delay,
"random_seed": self.random_seed,
"reasoning_effort": self.reasoning_effort,
"api_type": self.api_type,
}
self.update_model_params(shared_config)

@@ -223,6 +234,7 @@ def rebuild_models(self) -> None:
"retry_delay": self.retry_delay,
"random_seed": self.random_seed,
"reasoning_effort": self.reasoning_effort,
"api_type": self.api_type,
}
self.update_model_params(shared_config)

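A minimal sketch of the inheritance rule the shared_config plumbing implies, assuming `update_model_params` (not shown in this diff) fills only the per-model fields that are still `None`; the helper name here is hypothetical:

```python
from typing import Any, Dict, Optional

def merge_shared(model_params: Dict[str, Optional[Any]],
                 shared: Dict[str, Any]) -> Dict[str, Any]:
    """Fill unset (None) per-model fields from the LLM-level shared config."""
    merged = dict(model_params)
    for key, value in shared.items():
        if merged.get(key) is None:
            merged[key] = value
    return merged

shared = {"api_type": "auto", "reasoning_effort": "medium"}
# A model that leaves api_type unset inherits the LLM-level "auto" ...
print(merge_shared({"api_type": None}, shared))
# {'api_type': 'auto', 'reasoning_effort': 'medium'}
# ... while an explicit per-model value wins over the shared default.
print(merge_shared({"api_type": "responses"}, shared))
# {'api_type': 'responses', 'reasoning_effort': 'medium'}
```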
117 changes: 111 additions & 6 deletions openevolve/llm/openai.py
@@ -34,6 +34,7 @@ def __init__(
self.api_key = model_cfg.api_key
self.random_seed = getattr(model_cfg, "random_seed", None)
self.reasoning_effort = getattr(model_cfg, "reasoning_effort", None)
self.api_type = getattr(model_cfg, "api_type", "auto")

# Set up API client
# OpenAI client requires max_retries to be int, not None
@@ -45,6 +46,9 @@ def __init__(
max_retries=max_retries,
)

# Determine which API to use (Responses API vs Chat Completions)
self.use_responses_api = self._should_use_responses_api()

# Only log unique models to reduce duplication
if not hasattr(logger, "_initialized_models"):
logger._initialized_models = set()
@@ -53,6 +57,39 @@
logger.info(f"Initialized OpenAI LLM with model: {self.model}")
logger._initialized_models.add(self.model)

def _should_use_responses_api(self) -> bool:
"""
Determine if the Responses API should be used instead of Chat Completions.

The Responses API is only available on OpenAI's official endpoints.
For other providers (OpenRouter, Google AI Studio, local servers, etc.),
we must use the Chat Completions API for compatibility.

Returns:
True if Responses API should be used, False for Chat Completions
"""
# Normalize api_type (None defaults to "auto")
api_type = self.api_type if self.api_type is not None else "auto"

# Check for explicit override
if api_type == "responses":
return True
if api_type == "chat_completions":
return False

# Auto-detect based on API base URL
if not self.api_base:
return False

api_lower = self.api_base.lower()

# Only use Responses API for official OpenAI endpoints
return (
api_lower.startswith("https://api.openai.com") or
api_lower.startswith("https://eu.api.openai.com") or
api_lower.startswith("https://apac.api.openai.com")
)

Author comment on lines +60 to +92: We can also delete this and force the user to specify.

async def generate(self, prompt: str, **kwargs) -> str:
"""Generate text from a prompt"""
return await self.generate_with_context(
@@ -159,14 +196,82 @@ async def generate_with_context(
raise

async def _call_api(self, params: Dict[str, Any]) -> str:
"""Make the actual API call"""
"""Make the actual API call, dispatching to appropriate API"""
# Use asyncio to run the blocking API call in a thread pool
loop = asyncio.get_event_loop()
response = await loop.run_in_executor(
None, lambda: self.client.chat.completions.create(**params)
)

if self.use_responses_api:
response = await loop.run_in_executor(
None, lambda: self._call_responses_api(params)
)
response_text = response.output_text
else:
response = await loop.run_in_executor(
None, lambda: self.client.chat.completions.create(**params)
)
response_text = response.choices[0].message.content

# Logging of system prompt, user message and response content
logger = logging.getLogger(__name__)
logger.debug(f"API parameters: {params}")
logger.debug(f"API response: {response.choices[0].message.content}")
return response.choices[0].message.content
logger.debug(f"API response: {response_text}")
return response_text

def _call_responses_api(self, chat_params: Dict[str, Any]) -> Any:
"""
Convert Chat Completions params to Responses API format and make the call.

The Responses API uses a different parameter structure:
- 'messages' -> 'input' (can be array of messages)
- System message in 'messages' -> 'instructions' parameter
- 'max_tokens'/'max_completion_tokens' -> 'max_output_tokens'
- 'reasoning_effort' -> 'reasoning: {"effort": ...}'

Args:
chat_params: Parameters in Chat Completions format

Returns:
Response object from client.responses.create()
"""
messages = chat_params["messages"]

# Extract system message as instructions, keep other messages as input
instructions = None
input_messages = []
for msg in messages:
if msg["role"] == "system":
instructions = msg["content"]
else:
input_messages.append(msg)

# Build Responses API params
resp_params = {
"model": chat_params["model"],
"input": input_messages,
}

if instructions:
resp_params["instructions"] = instructions

# Map token limits (Responses API uses max_output_tokens)
if "max_completion_tokens" in chat_params:
resp_params["max_output_tokens"] = chat_params["max_completion_tokens"]
elif "max_tokens" in chat_params:
resp_params["max_output_tokens"] = chat_params["max_tokens"]

# Map sampling parameters
if "temperature" in chat_params:
resp_params["temperature"] = chat_params["temperature"]
if "top_p" in chat_params:
resp_params["top_p"] = chat_params["top_p"]
if "seed" in chat_params:
resp_params["seed"] = chat_params["seed"]

# Map reasoning_effort to nested format for Responses API
if "reasoning_effort" in chat_params:
resp_params["reasoning"] = {"effort": chat_params["reasoning_effort"]}

# Disable conversation storage (not needed for OpenEvolve's use case)
resp_params["store"] = False

return self.client.responses.create(**resp_params)
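To make the translation concrete, here is the same mapping pulled out as a standalone, client-free sketch; the function name and the example values are illustrative only:

```python
from typing import Any, Dict

def to_responses_params(chat_params: Dict[str, Any]) -> Dict[str, Any]:
    """Replicates the mapping above: Chat Completions params -> Responses API params."""
    instructions = None
    input_messages = []
    for msg in chat_params["messages"]:
        if msg["role"] == "system":
            instructions = msg["content"]  # system message becomes 'instructions'
        else:
            input_messages.append(msg)

    resp: Dict[str, Any] = {
        "model": chat_params["model"],
        "input": input_messages,
        "store": False,  # conversation storage disabled, as in the method above
    }
    if instructions:
        resp["instructions"] = instructions
    if "max_completion_tokens" in chat_params:
        resp["max_output_tokens"] = chat_params["max_completion_tokens"]
    elif "max_tokens" in chat_params:
        resp["max_output_tokens"] = chat_params["max_tokens"]
    for key in ("temperature", "top_p", "seed"):
        if key in chat_params:
            resp[key] = chat_params[key]
    if "reasoning_effort" in chat_params:
        resp["reasoning"] = {"effort": chat_params["reasoning_effort"]}
    return resp

print(to_responses_params({
    "model": "gpt-4.1",  # hypothetical model name
    "messages": [{"role": "system", "content": "Be terse."},
                 {"role": "user", "content": "Hello"}],
    "max_tokens": 256,
    "reasoning_effort": "low",
}))
# {'model': 'gpt-4.1', 'input': [{'role': 'user', 'content': 'Hello'}],
#  'store': False, 'instructions': 'Be terse.', 'max_output_tokens': 256,
#  'reasoning': {'effort': 'low'}}
```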
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -13,7 +13,7 @@ authors = [
{name = "codelion"}
]
dependencies = [
"openai>=1.0.0",
"openai>=1.80.0", # Required for Responses API
"pyyaml>=6.0",
"numpy>=1.22.0",
"tqdm>=4.64.0",
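Since the Responses API surface only exists in newer SDK releases, a small environment check one might run when debugging (the 1.80.0 floor comes from the pin above):

```python
import openai
from packaging.version import Version  # 'packaging' is a common transitive dependency

if Version(openai.__version__) < Version("1.80.0"):
    raise RuntimeError(
        f"openai {openai.__version__} predates the Responses API; "
        "upgrade with: pip install 'openai>=1.80.0'"
    )
```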
87 changes: 87 additions & 0 deletions tests/test_openai_model_detection.py
@@ -94,5 +94,92 @@ def is_reasoning_model(model_name, api_base):
)


class TestResponsesAPIDetection(unittest.TestCase):
"""Test Responses API vs Chat Completions API selection logic"""

def _should_use_responses_api(self, api_base, api_type="auto"):
"""Test function that mimics the logic in openai.py"""
# Check for explicit override
if api_type == "responses":
return True
if api_type == "chat_completions":
return False

# Auto-detect based on API base URL
if not api_base:
return False

api_lower = api_base.lower()

# Only use Responses API for official OpenAI endpoints
return (
api_lower.startswith("https://api.openai.com") or
api_lower.startswith("https://eu.api.openai.com") or
api_lower.startswith("https://apac.api.openai.com")
)

def test_openai_endpoints_use_responses_api(self):
"""Test that official OpenAI endpoints use Responses API by default"""
test_cases = [
("https://api.openai.com/v1", True, "Main OpenAI endpoint"),
("https://api.openai.com", True, "OpenAI without path"),
("https://eu.api.openai.com/v1", True, "EU endpoint"),
("https://apac.api.openai.com/v1", True, "APAC endpoint"),
("https://API.OPENAI.COM/v1", True, "Uppercase URL"),
]

for api_base, expected, description in test_cases:
with self.subTest(api_base=api_base, desc=description):
result = self._should_use_responses_api(api_base)
self.assertEqual(
result,
expected,
f"API base '{api_base}' ({description}): expected {expected}, got {result}",
)

def test_non_openai_endpoints_use_chat_completions(self):
"""Test that non-OpenAI endpoints use Chat Completions API"""
test_cases = [
("https://generativelanguage.googleapis.com/v1beta/openai/", False, "Google AI Studio"),
("https://openrouter.ai/api/v1", False, "OpenRouter"),
("http://localhost:8000/v1", False, "Local server"),
("https://api.anthropic.com/v1", False, "Anthropic"),
("https://api.deepseek.com/v1", False, "DeepSeek"),
(None, False, "None API base"),
("", False, "Empty API base"),
]

for api_base, expected, description in test_cases:
with self.subTest(api_base=api_base, desc=description):
result = self._should_use_responses_api(api_base)
self.assertEqual(
result,
expected,
f"API base '{api_base}' ({description}): expected {expected}, got {result}",
)

def test_explicit_api_type_override(self):
"""Test that api_type override works correctly"""
# Force responses API even for non-OpenAI endpoint
self.assertTrue(
self._should_use_responses_api("http://localhost:8000/v1", api_type="responses")
)

# Force chat completions even for OpenAI endpoint
self.assertFalse(
self._should_use_responses_api("https://api.openai.com/v1", api_type="chat_completions")
)

# Auto detection with OpenAI endpoint
self.assertTrue(
self._should_use_responses_api("https://api.openai.com/v1", api_type="auto")
)

# Auto detection with non-OpenAI endpoint
self.assertFalse(
self._should_use_responses_api("http://localhost:8000/v1", api_type="auto")
)


if __name__ == "__main__":
unittest.main()
51 changes: 49 additions & 2 deletions tests/test_reasoning_effort_config.py
@@ -132,6 +132,7 @@ def test_openai_llm_uses_reasoning_effort(self):
model_cfg.api_key = "test-key"
model_cfg.random_seed = None
model_cfg.reasoning_effort = "high"
model_cfg.api_type = "chat_completions" # Force Chat Completions API for this test

# Mock OpenAI client to avoid actual API calls
with unittest.mock.patch('openai.OpenAI'):
@@ -140,8 +141,8 @@
# Verify the reasoning_effort is stored
self.assertEqual(llm.reasoning_effort, "high")

def test_reasoning_effort_passed_to_api_params(self):
"""Test that reasoning_effort is included in API call parameters"""
def test_reasoning_effort_passed_to_api_params_chat_completions(self):
"""Test that reasoning_effort is included in API call parameters (Chat Completions)"""
model_cfg = Mock()
model_cfg.name = "gpt-oss-120b"
model_cfg.system_message = "system"
@@ -155,6 +156,7 @@ def test_reasoning_effort_passed_to_api_params(self):
model_cfg.api_key = "test-key"
model_cfg.random_seed = None
model_cfg.reasoning_effort = "medium"
model_cfg.api_type = "chat_completions" # Force Chat Completions API for this test

with unittest.mock.patch('openai.OpenAI'):
llm = OpenAILLM(model_cfg)
@@ -178,6 +180,51 @@ def test_reasoning_effort_passed_to_api_params(self):
# Verify the API was called with reasoning_effort
llm.client.chat.completions.create.assert_called_once_with(**test_params)

def test_reasoning_effort_passed_to_responses_api(self):
"""Test that reasoning_effort is converted to nested format for Responses API"""
model_cfg = Mock()
model_cfg.name = "gpt-oss-120b"
model_cfg.system_message = "system"
model_cfg.temperature = 0.7
model_cfg.top_p = 0.95
model_cfg.max_tokens = 4096
model_cfg.timeout = 60
model_cfg.retries = 3
model_cfg.retry_delay = 5
model_cfg.api_base = "https://api.openai.com/v1"
model_cfg.api_key = "test-key"
model_cfg.random_seed = None
model_cfg.reasoning_effort = "medium"
model_cfg.api_type = "responses" # Force Responses API for this test

with unittest.mock.patch('openai.OpenAI'):
llm = OpenAILLM(model_cfg)

# Test the _call_api method directly with mocked client
mock_response = Mock()
mock_response.output_text = "Test response"
llm.client.responses.create.return_value = mock_response

# Input params in Chat Completions format
test_params = {
"model": "gpt-oss-120b",
"messages": [{"role": "system", "content": "Test"}, {"role": "user", "content": "Test user"}],
"max_completion_tokens": 4096,
"reasoning_effort": "medium"
}

result = asyncio.run(llm._call_api(test_params))

# Verify the Responses API was called with nested reasoning format
llm.client.responses.create.assert_called_once()
call_args = llm.client.responses.create.call_args
self.assertEqual(call_args.kwargs["model"], "gpt-oss-120b")
self.assertEqual(call_args.kwargs["instructions"], "Test")
self.assertEqual(call_args.kwargs["input"], [{"role": "user", "content": "Test user"}])
self.assertEqual(call_args.kwargs["reasoning"], {"effort": "medium"})
self.assertEqual(call_args.kwargs["max_output_tokens"], 4096)
self.assertFalse(call_args.kwargs["store"])

def test_yaml_file_loading_with_reasoning_effort(self):
"""Test loading reasoning_effort from actual YAML file"""
yaml_content = """