From da0e3c3176e9768f11b9e8e66f0ebb4777e76c0b Mon Sep 17 00:00:00 2001
From: Game_Time <108236317+RayBytes@users.noreply.github.com>
Date: Mon, 23 Mar 2026 23:13:58 +0500
Subject: [PATCH] fixes #103: strip unsupported max_output_tokens from responses api payloads

---
 DOCKER.md                               |   3 +-
 chatmock/cli.py                         |   2 +-
 chatmock/responses_api.py               |   1 +
 chatmock/routes_ollama.py               |   6 +-
 chatmock/routes_openai.py               |   6 +-
 gui.py                                  |  19 ++-
 scripts/test_responses_cached_tokens.py | 176 ------------------------
 scripts/test_responses_reuse.py         | 143 -------------------
 tests/test_routes.py                    | 107 +++++++++++++-
 9 files changed, 132 insertions(+), 331 deletions(-)
 delete mode 100644 scripts/test_responses_cached_tokens.py
 delete mode 100644 scripts/test_responses_reuse.py
diff --git a/DOCKER.md b/DOCKER.md
index 1314c97..c483d63 100644
--- a/DOCKER.md
+++ b/DOCKER.md
@@ -25,13 +25,12 @@ Set options in `.env` or pass environment variables:
 - `CHATGPT_LOCAL_REASONING_SUMMARY`: auto|concise|detailed|none
 - `CHATGPT_LOCAL_REASONING_COMPAT`: legacy|o3|think-tags|current
 - `CHATGPT_LOCAL_FAST_MODE`: `true|false` to enable fast mode by default for supported models
-- `CHATGPT_LOCAL_DEBUG_MODEL`: force model override (e.g., `gpt-5.4`)
 - `CHATGPT_LOCAL_CLIENT_ID`: OAuth client id override (rarely needed)
 - `CHATGPT_LOCAL_EXPOSE_REASONING_MODELS`: `true|false` to add reasoning model variants to `/v1/models`
 - `CHATGPT_LOCAL_ENABLE_WEB_SEARCH`: `true|false` to enable default web search tool
 
 ## Logs
-Set `VERBOSE=true` to include extra logging for debugging issues in upstream or chat app requests. Please include and use these logs when submitting bug reports.
+Set `VERBOSE=true` to enable extra logging for troubleshooting upstream or chat app requests. Please include these logs when submitting bug reports.
 
 ## Test
 
diff --git a/chatmock/cli.py b/chatmock/cli.py
index 78a69ae..8482cf3 100644
--- a/chatmock/cli.py
+++ b/chatmock/cli.py
@@ -284,7 +284,7 @@ def cmd_serve(
         default_web_search=default_web_search,
     )
 
-    app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
+    app.run(host=host, use_reloader=False, port=port, threaded=True)
     return 0
 
 
diff --git a/chatmock/responses_api.py b/chatmock/responses_api.py
index 9aae843..51bda2a 100644
--- a/chatmock/responses_api.py
+++ b/chatmock/responses_api.py
@@ -88,6 +88,7 @@ def normalize_responses_payload(
 
     normalized = dict(payload)
     normalized["model"] = normalized_model
+    normalized.pop("max_output_tokens", None)
 
     if "input" in normalized:
         normalized["input"] = canonicalize_responses_input(normalized.get("input"))
diff --git a/chatmock/routes_ollama.py b/chatmock/routes_ollama.py
index 96c7c8b..5da18d0 100644
--- a/chatmock/routes_ollama.py
+++ b/chatmock/routes_ollama.py
@@ -250,7 +250,7 @@ def ollama_chat() -> Response:
     input_items = convert_chat_messages_to_responses_input(messages)
 
     model_reasoning = extract_reasoning_from_model_name(model)
-    normalized_model = normalize_model_name(model)
+    normalized_model = normalize_model_name(model, current_app.config.get("DEBUG_MODEL"))
     service_tier_resolution = resolve_service_tier(
         normalized_model,
         request_fast_mode=payload.get("fast_mode"),
@@ -306,7 +306,7 @@ def ollama_chat() -> Response:
             base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
             safe_choice = payload.get("tool_choice", "auto")
             upstream2, err2 = start_upstream_request(
-                normalize_model_name(model),
+                normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
                 input_items,
                 instructions=BASE_INSTRUCTIONS,
                 tools=base_tools_only,
@@ -570,7 +570,7 @@ def ollama_chat() -> Response:
             full_text = f"<think>{rtxt}</think>" + (full_text or "")
 
     out_json = {
-        "model": normalize_model_name(model),
+        "model": normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
         "created_at": created_at,
         "message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
         "done": True,
diff --git a/chatmock/routes_openai.py b/chatmock/routes_openai.py
index 437ebef..eb37842 100644
--- a/chatmock/routes_openai.py
+++ b/chatmock/routes_openai.py
@@ -109,7 +109,6 @@ def chat_completions() -> Response:
     reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
     reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
     reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
-    debug_model = current_app.config.get("DEBUG_MODEL")
 
     raw = request.get_data(cache=True, as_text=True) or ""
     if verbose:
@@ -129,7 +128,7 @@ def chat_completions() -> Response:
             return jsonify(err), 400
 
     requested_model = payload.get("model")
-    model = normalize_model_name(requested_model, debug_model)
+    model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
     messages = payload.get("messages")
     if messages is None and isinstance(payload.get("prompt"), str):
         messages = [{"role": "user", "content": payload.get("prompt") or ""}]
@@ -413,7 +412,6 @@ def chat_completions() -> Response:
 def completions() -> Response:
     verbose = bool(current_app.config.get("VERBOSE"))
     verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION"))
-    debug_model = current_app.config.get("DEBUG_MODEL")
     reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
     reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
 
@@ -432,7 +430,7 @@ def completions() -> Response:
         return jsonify(err), 400
 
     requested_model = payload.get("model")
-    model = normalize_model_name(requested_model, debug_model)
+    model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
     prompt = payload.get("prompt")
     if isinstance(prompt, list):
         prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
diff --git a/gui.py b/gui.py
index 82929fe..5bdc18c 100644
--- a/gui.py
+++ b/gui.py
@@ -19,6 +19,7 @@ def run_server(
     reasoning_summary: str = "auto",
     reasoning_compat: str = "think-tags",
     fast_mode: bool = False,
+    debug_model: str | None = None,
     expose_reasoning_models: bool = False,
     default_web_search: bool = False,
 ) -> None:
@@ -27,10 +28,11 @@ def run_server(
         reasoning_summary=reasoning_summary,
         reasoning_compat=reasoning_compat,
         fast_mode=fast_mode,
+        debug_model=debug_model,
         expose_reasoning_models=expose_reasoning_models,
         default_web_search=default_web_search,
     )
-    app.run(host=host, port=port, debug=False, use_reloader=False, threaded=True)
+    app.run(host=host, port=port, use_reloader=False, threaded=True)
 
 
 class ServerProcess(QtCore.QObject):
@@ -45,6 +47,7 @@ class ServerProcess(QtCore.QObject):
         self._summary = "auto"
         self._compat = "think-tags"
         self._fast_mode = False
+        self._debug_model: str | None = None
         self._expose_reasoning_models = False
         self._default_web_search = False
 
@@ -59,6 +62,7 @@ class ServerProcess(QtCore.QObject):
         summary: str,
         compat: str,
         fast_mode: bool,
+        debug_model: str | None,
         expose_reasoning_models: bool,
         default_web_search: bool,
     ) -> None:
@@ -68,6 +72,7 @@ class ServerProcess(QtCore.QObject):
         self._effort, self._summary = effort, summary
         self._compat = compat
         self._fast_mode = fast_mode
+        self._debug_model = debug_model
         self._expose_reasoning_models = expose_reasoning_models
         self._default_web_search = default_web_search
         self._proc = QtCore.QProcess()
@@ -80,6 +85,8 @@ class ServerProcess(QtCore.QObject):
             "--summary", summary,
             "--compat", compat,
         ]
+        if isinstance(debug_model, str) and debug_model.strip():
+            args.extend(["--debug-model", debug_model.strip()])
         if fast_mode:
             args.append("--fast-mode")
         if expose_reasoning_models:
@@ -317,6 +324,12 @@ class MainWindow(QtWidgets.QMainWindow):
         self.port_edit.setValidator(QtGui.QIntValidator(1, 65535, self))
         self.port_edit.setMaximumWidth(100)
         form.addWidget(self.port_edit, 0, 3)
+        form.addWidget(QtWidgets.QLabel("Debug Model"), 1, 0)
+        self.debug_model_edit = QtWidgets.QLineEdit("")
+        self.debug_model_edit.setClearButtonEnabled(True)
+        self.debug_model_edit.setPlaceholderText("Optional override, e.g. gpt-5.4")
+        self.debug_model_edit.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Fixed)
+        form.addWidget(self.debug_model_edit, 1, 1, 1, 3)
         form.setColumnStretch(1, 1)
         srv_layout.addLayout(form)
 
@@ -473,6 +486,7 @@ class MainWindow(QtWidgets.QMainWindow):
         summary = self.summary.currentText().strip()
         compat = self.compat.currentText().strip()
         fast_mode = self.fast_mode.isChecked()
+        debug_model = self.debug_model_edit.text().strip() or None
         expose_reasoning_models = self.expose_reasoning_models.isChecked()
         default_web_search = self.enable_web_search.isChecked()
         self.status.setText(f"Starting server at http://{host}:{port} …")
@@ -484,6 +498,7 @@ class MainWindow(QtWidgets.QMainWindow):
             summary,
             compat,
             fast_mode,
+            debug_model,
             expose_reasoning_models,
             default_web_search,
         )
@@ -536,6 +551,7 @@ def main() -> None:
         p.add_argument("--summary", default="auto")
         p.add_argument("--compat", default="think-tags")
         p.add_argument("--fast-mode", action="store_true")
+        p.add_argument("--debug-model")
         p.add_argument("--expose-reasoning-models", action="store_true")
         p.add_argument("--enable-web-search", action="store_true")
         args, _ = p.parse_known_args()
@@ -546,6 +562,7 @@ def main() -> None:
             args.summary,
             args.compat,
             args.fast_mode,
+            args.debug_model,
             args.expose_reasoning_models,
             args.enable_web_search,
         )
diff --git a/scripts/test_responses_cached_tokens.py b/scripts/test_responses_cached_tokens.py
deleted file mode 100644
index 9cf05f5..0000000
--- a/scripts/test_responses_cached_tokens.py
+++ /dev/null
@@ -1,176 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-import uuid
-from typing import Any, Dict
-
-import requests
-
-
-def _post(url: str, api_key: str, session_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
-    response = requests.post(
-        url,
-        headers={
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": "application/json",
-            "X-Session-Id": session_id,
-        },
-        json=payload,
-        timeout=180,
-    )
-    try:
-        body = response.json()
-    except Exception:
-        body = {"raw": response.text}
-    if response.status_code >= 400:
-        raise RuntimeError(
-            f"POST {url} failed with {response.status_code}: {json.dumps(body, ensure_ascii=False)}"
-        )
-    if not isinstance(body, dict):
-        raise RuntimeError(f"Expected JSON object response, got: {body!r}")
-    return body
-
-
-def _usage_summary(body: Dict[str, Any]) -> Dict[str, Any]:
-    usage = body.get("usage")
-    if not isinstance(usage, dict):
-        return {}
-    return usage
-
-
-def _cached_tokens(body: Dict[str, Any]) -> int | None:
-    usage = _usage_summary(body)
-    details = usage.get("input_tokens_details")
-    if not isinstance(details, dict):
-        return None
-    value = details.get("cached_tokens")
-    try:
-        return int(value)
-    except Exception:
-        return None
-
-
-def _assistant_message_item(body: Dict[str, Any]) -> Dict[str, Any]:
-    output = body.get("output")
-    if not isinstance(output, list):
-        raise RuntimeError("Response did not include an output list.")
-    for item in output:
-        if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant":
-            return item
-    raise RuntimeError("Response did not include an assistant message item.")
-
-
-def _user_message(text: str) -> Dict[str, Any]:
-    return {
-        "type": "message",
-        "role": "user",
-        "content": [{"type": "input_text", "text": text}],
-    }
-
-
-def _default_prefix() -> str:
-    seed = "Cache test prefix. Repeat this context exactly for cache measurement. "
-    return "".join(seed for _ in range(220))
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description="Drive two raw /v1/responses turns through ChatMock and check cached input tokens."
-    )
-    parser.add_argument("--base-url", default="http://127.0.0.1:8000", help="ChatMock base URL.")
-    parser.add_argument("--api-key", default="key", help="Bearer token to send to ChatMock.")
-    parser.add_argument("--model", default="gpt-5.4", help="Model to request.")
-    parser.add_argument(
-        "--session-id",
-        default=f"cache-check-{uuid.uuid4()}",
-        help="Fixed X-Session-Id for both turns.",
-    )
-    parser.add_argument(
-        "--prefix",
-        default=_default_prefix(),
-        help="Large repeated first-turn prompt prefix.",
-    )
-    parser.add_argument(
-        "--first-question",
-        default="Reply with exactly: alpha",
-        help="Trailing instruction for the first turn.",
-    )
-    parser.add_argument(
-        "--second-question",
-        default="Reply with exactly: beta",
-        help="Trailing instruction for the second turn.",
-    )
-    args = parser.parse_args()
-
-    responses_url = args.base_url.rstrip("/") + "/v1/responses"
-    session_id = args.session_id
-    first_text = f"{args.prefix}\n\n{args.first_question}"
-    second_text = args.second_question
-
-    print(f"Using session id: {session_id}")
-    print(f"POST target: {responses_url}")
-    print("This checks the raw Responses usage object returned through ChatMock.")
-    print()
-
-    first_payload = {
-        "model": args.model,
-        "store": False,
-        "stream": False,
-        "input": first_text,
-    }
-    first_response = _post(responses_url, args.api_key, session_id, first_payload)
-    assistant_item = _assistant_message_item(first_response)
-
-    second_payload = {
-        "model": args.model,
-        "store": False,
-        "stream": False,
-        "input": [
-            _user_message(first_text),
-            assistant_item,
-            _user_message(second_text),
-        ],
-    }
-    second_response = _post(responses_url, args.api_key, session_id, second_payload)
-
-    first_usage = _usage_summary(first_response)
-    second_usage = _usage_summary(second_response)
-    first_cached = _cached_tokens(first_response)
-    second_cached = _cached_tokens(second_response)
-
-    print("Turn 1")
-    print(json.dumps(first_usage, indent=2, ensure_ascii=False) if first_usage else "  no usage object")
-    print()
-    print("Turn 2")
-    print(json.dumps(second_usage, indent=2, ensure_ascii=False) if second_usage else "  no usage object")
-    print()
-
-    if second_cached is None:
-        first_input_tokens = first_usage.get("input_tokens") if isinstance(first_usage, dict) else None
-        second_input_tokens = second_usage.get("input_tokens") if isinstance(second_usage, dict) else None
-        print("Result: inconclusive")
-        print("Reason: upstream did not include `usage.input_tokens_details.cached_tokens`.")
-        if isinstance(first_input_tokens, int) and isinstance(second_input_tokens, int):
-            print(f"Observed input_tokens delta: first={first_input_tokens}, second={second_input_tokens}")
-        print("Codex treats cached-token reporting as the direct cache-hit signal; without it, this script cannot prove caching.")
-        return 2
-
-    if second_cached > 0:
-        print(f"Result: success, follow-up turn reported cached_tokens={second_cached}.")
-        return 0
-
-    print("Result: failure, follow-up turn reported cached_tokens=0.")
-    return 1
-
-
-if __name__ == "__main__":
-    try:
-        raise SystemExit(main())
-    except KeyboardInterrupt:
-        raise SystemExit(130)
-    except Exception as exc:
-        print(f"error: {exc}", file=sys.stderr)
-        raise SystemExit(1)
diff --git a/scripts/test_responses_reuse.py b/scripts/test_responses_reuse.py
deleted file mode 100644
index 5e506ab..0000000
--- a/scripts/test_responses_reuse.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-import uuid
-from typing import Any, Dict, Tuple
-
-from websockets.sync.client import connect
-
-
-def _user_message(text: str) -> Dict[str, Any]:
-    return {
-        "type": "message",
-        "role": "user",
-        "content": [{"type": "input_text", "text": text}],
-    }
-
-
-def _receive_turn(ws) -> Tuple[str, Dict[str, Any]]:
-    response_id: str | None = None
-    assistant_item: Dict[str, Any] | None = None
-
-    while True:
-        raw = ws.recv(timeout=120)
-        event = json.loads(raw)
-        event_type = event.get("type")
-        if event_type == "error":
-            raise RuntimeError(f"websocket error: {json.dumps(event, ensure_ascii=False)}")
-        if event_type == "response.created":
-            response = event.get("response")
-            if isinstance(response, dict) and isinstance(response.get("id"), str):
-                response_id = response["id"]
-        elif event_type == "response.output_item.done":
-            item = event.get("item")
-            if (
-                isinstance(item, dict)
-                and item.get("type") == "message"
-                and item.get("role") == "assistant"
-            ):
-                assistant_item = item
-        elif event_type == "response.completed":
-            if not response_id:
-                response = event.get("response")
-                if isinstance(response, dict) and isinstance(response.get("id"), str):
-                    response_id = response["id"]
-            if not response_id:
-                raise RuntimeError("turn completed without a response id")
-            if assistant_item is None:
-                raise RuntimeError("turn completed without an assistant message item")
-            return response_id, assistant_item
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description="Exercise ChatMock websocket reuse the same way Codex does."
-    )
-    parser.add_argument(
-        "--ws-url",
-        default="ws://127.0.0.1:8000/v1/responses",
-        help="ChatMock websocket URL.",
-    )
-    parser.add_argument("--model", default="gpt-5.4", help="Model to request.")
-    parser.add_argument(
-        "--session-id",
-        default=f"reuse-demo-{uuid.uuid4()}",
-        help="Fixed X-Session-Id for the whole run.",
-    )
-    parser.add_argument(
-        "--first-prompt",
-        default="Say exactly: alpha",
-        help="Prompt for the first turn.",
-    )
-    parser.add_argument(
-        "--second-prompt",
-        default="Now say exactly: beta",
-        help="Prompt appended in the reuse-candidate turn.",
-    )
-    parser.add_argument(
-        "--no-fast-mode",
-        action="store_true",
-        help="Do not send fast_mode=true.",
-    )
-    args = parser.parse_args()
-
-    headers = {"X-Session-Id": args.session_id}
-    fast_mode = not args.no_fast_mode
-
-    print(f"Using websocket session id: {args.session_id}")
-    print(f"Connecting to: {args.ws_url}")
-    print("Run ChatMock with `python3 chatmock.py serve --verbose` in another terminal.")
-    print("This verifies the Codex-aligned path: websocket `response.create` reuse.")
-    print("HTTP `/v1/responses` is not expected to send `previous_response_id`.")
-    print()
-
-    with connect(args.ws_url, additional_headers=headers, open_timeout=15) as ws:
-        first_request = {
-            "type": "response.create",
-            "model": args.model,
-            "store": False,
-            "input": args.first_prompt,
-            "fast_mode": fast_mode,
-        }
-        ws.send(json.dumps(first_request))
-        first_response_id, assistant_item = _receive_turn(ws)
-
-        second_request = {
-            "type": "response.create",
-            "model": args.model,
-            "store": False,
-            "input": [
-                _user_message(args.first_prompt),
-                assistant_item,
-                _user_message(args.second_prompt),
-            ],
-            "fast_mode": fast_mode,
-        }
-        ws.send(json.dumps(second_request))
-        second_response_id, _ = _receive_turn(ws)
-
-    print("Turn 1 completed.")
-    print(f"  response id: {first_response_id}")
-    print("Turn 2 completed.")
-    print(f"  response id: {second_response_id}")
-    print()
-    print("Expected in the verbose ChatMock server log for turn 2:")
-    print("  - outbound websocket payload includes `previous_response_id`")
-    print("  - `previous_response_id` equals the first response id")
-    print("  - outbound `input` only contains the new trailing user message")
-    print()
-    print("If turn 2 still shows the full conversation in the outbound websocket payload, reuse is not working.")
-    return 0
-
-
-if __name__ == "__main__":
-    try:
-        raise SystemExit(main())
-    except KeyboardInterrupt:
-        raise SystemExit(130)
-    except Exception as exc:
-        print(f"error: {exc}", file=sys.stderr)
-        raise SystemExit(1)
diff --git a/tests/test_routes.py b/tests/test_routes.py
index 1316bc8..c5d94bc 100644
--- a/tests/test_routes.py
+++ b/tests/test_routes.py
@@ -91,6 +91,26 @@ class RouteTests(unittest.TestCase):
         self.assertEqual(body["choices"][0]["message"]["content"], "hello")
         self.assertEqual(body["model"], "gpt5.4-mini")
 
+    @patch("chatmock.routes_openai.start_upstream_request")
+    def test_chat_completions_honors_debug_model_override(self, mock_start) -> None:
+        app = create_app(debug_model="gpt-5.4")
+        client = app.test_client()
+        mock_start.return_value = (
+            FakeUpstream(
+                [
+                    {"type": "response.output_text.delta", "delta": "hello"},
+                    {"type": "response.completed", "response": {"id": "resp-openai"}},
+                ]
+            ),
+            None,
+        )
+        response = client.post(
+            "/v1/chat/completions",
+            json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}]},
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(mock_start.call_args.args[0], "gpt-5.4")
+
     @patch("chatmock.routes_ollama.start_upstream_request")
     def test_ollama_chat(self, mock_start) -> None:
         mock_start.return_value = (
@@ -111,6 +131,28 @@ class RouteTests(unittest.TestCase):
         self.assertEqual(body["message"]["content"], "hello")
         self.assertEqual(body["model"], "gpt-5.4")
 
+    @patch("chatmock.routes_ollama.start_upstream_request")
+    def test_ollama_chat_honors_debug_model_override(self, mock_start) -> None:
+        app = create_app(debug_model="gpt-5.4")
+        client = app.test_client()
+        mock_start.return_value = (
+            FakeUpstream(
+                [
+                    {"type": "response.output_text.delta", "delta": "hello"},
+                    {"type": "response.completed"},
+                ]
+            ),
+            None,
+        )
+        response = client.post(
+            "/api/chat",
+            json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}], "stream": False},
+        )
+        body = response.get_json()
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(mock_start.call_args.args[0], "gpt-5.4")
+        self.assertEqual(body["model"], "gpt-5.4")
+
     @patch("chatmock.routes_openai.start_upstream_request")
     def test_chat_completions_fast_mode_sets_priority_service_tier(self, mock_start) -> None:
         mock_start.return_value = (
@@ -212,6 +254,70 @@ class RouteTests(unittest.TestCase):
         self.assertEqual(outbound_payload["reasoning"]["effort"], "medium")
         self.assertIsInstance(outbound_payload["prompt_cache_key"], str)
 
+    @patch("chatmock.routes_openai.start_upstream_raw_request")
+    def test_responses_route_honors_debug_model_override(self, mock_start) -> None:
+        app = create_app(debug_model="gpt-5.4")
+        client = app.test_client()
+        mock_start.return_value = (
+            FakeUpstream(
+                [
+                    {
+                        "type": "response.created",
+                        "response": {"id": "resp_debug", "object": "response", "status": "in_progress"},
+                    },
+                    {
+                        "type": "response.completed",
+                        "response": {
+                            "id": "resp_debug",
+                            "object": "response",
+                            "status": "completed",
+                            "output": [],
+                        },
+                    },
+                ],
+                headers={"Content-Type": "text/event-stream"},
+            ),
+            None,
+        )
+        response = client.post(
+            "/v1/responses",
+            json={"model": "gpt-5.3-codex", "input": "hello"},
+        )
+        self.assertEqual(response.status_code, 200)
+        outbound_payload = mock_start.call_args.args[0]
+        self.assertEqual(outbound_payload["model"], "gpt-5.4")
+
+    @patch("chatmock.routes_openai.start_upstream_raw_request")
+    def test_responses_route_strips_unsupported_max_output_tokens(self, mock_start) -> None:
+        mock_start.return_value = (
+            FakeUpstream(
+                [
+                    {
+                        "type": "response.created",
+                        "response": {"id": "resp_limit", "object": "response", "status": "in_progress"},
+                    },
+                    {
+                        "type": "response.completed",
+                        "response": {
+                            "id": "resp_limit",
+                            "object": "response",
+                            "status": "completed",
+                            "output": [],
+                        },
+                    },
+                ],
+                headers={"Content-Type": "text/event-stream"},
+            ),
+            None,
+        )
+        response = self.client.post(
+            "/v1/responses",
+            json={"model": "gpt-5.4", "input": "hello", "max_output_tokens": 20},
+        )
+        self.assertEqual(response.status_code, 200)
+        outbound_payload = mock_start.call_args.args[0]
+        self.assertNotIn("max_output_tokens", outbound_payload)
+
     @patch("chatmock.routes_openai.start_upstream_raw_request")
     def test_responses_route_does_not_use_previous_response_id_for_http_follow_up(self, mock_start) -> None:
         mock_start.side_effect = [
@@ -496,7 +602,6 @@ class RouteTests(unittest.TestCase):
             kwargs={
                 "host": host,
                 "port": port,
-                "debug": False,
                 "use_reloader": False,
                 "threaded": True,
             },