Skip to content

Commit 66b3022

Browse files
author
Wojciech Napierała
committed
Enhance automated review temperature adjustment for OpenAI O-series models
- Updated the ReviewEngine to dynamically adjust the sampling temperature for O-series models based on API constraints.
- Added a new method to resolve the appropriate temperature based on the model name.
- Included a test to verify that the correct temperature is set for O-series models.
- Updated the CHANGELOG to document this enhancement.
1 parent 9e3a777 commit 66b3022

3 files changed

Lines changed: 64 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ All notable changes to this project will be documented in this file. The format
3333

3434
### Changed
3535
- Startup health checks now warn (rather than exit) when Redis or ChromaDB are unavailable, enabling seamless in-memory fallbacks.
36+
- Automated review automatically adjusts sampling temperature for OpenAI O-series models to satisfy API constraints.
3637
## [0.1.1] - 2025-11-07
3738

3839
### Added

core/review.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
v0.5 - 2025-11-07 - Normalised metadata serialisation for automated review payloads.
99
v0.6 - 2025-11-07 - Applied provider-aware routing for automated review models.
1010
v0.7 - 2025-11-07 - Logged automated review failures before surfacing to callers.
11+
v0.8 - 2025-11-07 - Adjusted reviewer temperature for O-series OpenAI models.
1112
"""
1213

1314
from __future__ import annotations
@@ -113,6 +114,7 @@ def _run_automated_review(
113114
try:
114115
self._logger.debug("Running automated review with model %s", model)
115116
model_name, provider_kwargs = self._resolve_model_configuration()
117+
sampling_temperature = self._resolve_temperature(model_name)
116118
payload = {
117119
"task_prompt": request.prompt,
118120
"workflow": request.workflow,
@@ -138,7 +140,7 @@ def _run_automated_review(
138140
),
139141
},
140142
],
141-
temperature=0.0,
143+
temperature=sampling_temperature,
142144
request_timeout=self._config.llm.timeouts.request_seconds,
143145
**provider_kwargs,
144146
)
@@ -193,6 +195,15 @@ def _normalise_verdict(raw_verdict: Optional[str]) -> str:
193195
return "fail-auto"
194196
return verdict
195197

198+
@staticmethod
def _resolve_temperature(model_name: str) -> float:
    """Return the sampling temperature to use for the given reviewer model.

    OpenAI O-series reasoning models (o1, o3, o4, including dated and
    "-mini"/"-pro" variants) reject any temperature other than the default,
    so they must be called with 1.0; every other model keeps the
    deterministic 0.0 used for automated review.

    Args:
        model_name: Reviewer model identifier, possibly prefixed with a
            provider route (e.g. "openai/o3-mini").

    Returns:
        1.0 for O-series models, 0.0 otherwise.
    """
    normalized = model_name.lower()
    # Drop any provider/route prefix: "openai/o3-mini" -> "o3-mini".
    short_name = normalized.split("/")[-1]
    # O-series names are "o" followed by a digit ("o1", "o3-mini", "o4-...")
    # or the "o-" prefix. A bare startswith("o") — as previously used —
    # also matched unrelated models such as "omni-moderation-latest".
    is_o_series = (
        len(short_name) >= 2
        and short_name[0] == "o"
        and (short_name[1].isdigit() or short_name[1] == "-")
    )
    return 1.0 if is_o_series else 0.0
206+
196207
def _activate_litellm_debug(self) -> None:
197208
"""Enable LiteLLM debug logging for automated review when configured."""
198209
if not self._config.llm.enable_debug:

tests/test_review_engine.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from config import settings
1616
from core.live_loop import LiveTaskLoop
1717
from core.review import ReviewEngine
18+
from models.workflows import TaskRequest, TaskResult
1819

1920

2021
def _entry_has_user_task(entry: Dict[str, object], expected: str) -> bool:
@@ -186,3 +187,53 @@ def test_resolve_model_configuration_uses_azure_provider(monkeypatch: pytest.Mon
186187
assert model_name == "azure/gpt-4.1"
187188
assert kwargs["custom_llm_provider"] == "azure"
188189
assert kwargs["api_base"] == "https://example.openai.azure.com"
190+
191+
192+
def test_review_engine_sets_o_series_temperature(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Verify the reviewer sends temperature 1.0 when the model is O-series.

    Configures the engine with "o3-mini", stubs out litellm, and checks the
    temperature that reaches the completion call.
    """
    # Point the reviewer at an O-series model; clearing the provider lets
    # the engine fall back to its default provider resolution.
    config = _load_sample_config(tmp_path)
    config.review.auto_reviewer_model = "o3-mini"
    config.review.auto_reviewer_provider = None

    # Records the keyword arguments observed by the fake completion call.
    captured: Dict[str, Any] = {}

    def _fake_completion(*args: Any, temperature: float, **kwargs: Any) -> Dict[str, Any]:
        # Capture the temperature the engine chose, then return a minimal
        # well-formed review payload so perform_review can parse a verdict.
        captured["temperature"] = temperature
        return {
            "choices": [
                {
                    "message": {
                        "content": (
                            "VERDICT: PASS\n"
                            "REASONING: Compliant.\n"
                            "QUALITY_SCORE: 0.9\n"
                            "SUGGESTIONS:\n"
                            "- None."
                        )
                    }
                }
            ],
            "usage": {},
        }

    class DummyTimeout(Exception):
        """Placeholder timeout exception."""

    # Replace the litellm module used by core.review with a stub exposing
    # only the attributes the engine touches (completion and Timeout).
    dummy_litellm = SimpleNamespace(
        completion=_fake_completion,
        Timeout=DummyTimeout,
    )
    monkeypatch.setattr("core.review.litellm", dummy_litellm)

    engine = ReviewEngine(config)
    # Bypass real provider routing so the engine sees the bare model name
    # with no extra provider kwargs.
    monkeypatch.setattr(
        engine,
        "_resolve_model_configuration",
        lambda: ("o3-mini", {}),
    )

    request = TaskRequest(workflow="fast", prompt="demo")
    result = TaskResult(workflow="fast", content="ok", latency_seconds=0.1)

    engine.perform_review(request, result)

    # O-series models must be called with the API-mandated temperature 1.0.
    assert captured.get("temperature") == pytest.approx(1.0)

0 commit comments

Comments
 (0)