From da0e3c3176e9768f11b9e8e66f0ebb4777e76c0b Mon Sep 17 00:00:00 2001 From: Game_Time <108236317+RayBytes@users.noreply.github.com> Date: Mon, 23 Mar 2026 23:13:58 +0500 Subject: [PATCH] fixes #103: responses api max_output_tokens bug --- DOCKER.md | 3 +- chatmock/cli.py | 2 +- chatmock/responses_api.py | 1 + chatmock/routes_ollama.py | 6 +- chatmock/routes_openai.py | 6 +- gui.py | 19 ++- scripts/test_responses_cached_tokens.py | 176 ------------------------ scripts/test_responses_reuse.py | 143 ------------------- tests/test_routes.py | 107 +++++++++++++- 9 files changed, 132 insertions(+), 331 deletions(-) delete mode 100644 scripts/test_responses_cached_tokens.py delete mode 100644 scripts/test_responses_reuse.py diff --git a/DOCKER.md b/DOCKER.md index 1314c97..c483d63 100644 --- a/DOCKER.md +++ b/DOCKER.md @@ -25,13 +25,12 @@ Set options in `.env` or pass environment variables: - `CHATGPT_LOCAL_REASONING_SUMMARY`: auto|concise|detailed|none - `CHATGPT_LOCAL_REASONING_COMPAT`: legacy|o3|think-tags|current - `CHATGPT_LOCAL_FAST_MODE`: `true|false` to enable fast mode by default for supported models -- `CHATGPT_LOCAL_DEBUG_MODEL`: force model override (e.g., `gpt-5.4`) - `CHATGPT_LOCAL_CLIENT_ID`: OAuth client id override (rarely needed) - `CHATGPT_LOCAL_EXPOSE_REASONING_MODELS`: `true|false` to add reasoning model variants to `/v1/models` - `CHATGPT_LOCAL_ENABLE_WEB_SEARCH`: `true|false` to enable default web search tool ## Logs -Set `VERBOSE=true` to include extra logging for debugging issues in upstream or chat app requests. Please include and use these logs when submitting bug reports. +Set `VERBOSE=true` to include extra logging for troubleshooting upstream or chat app requests. Please include and use these logs when submitting bug reports. ## Test diff --git a/chatmock/cli.py b/chatmock/cli.py index 78a69ae..8482cf3 100644 --- a/chatmock/cli.py +++ b/chatmock/cli.py @@ -284,7 +284,7 @@ def cmd_serve( default_web_search=default_web_search, ) - app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True) + app.run(host=host, use_reloader=False, port=port, threaded=True) return 0 diff --git a/chatmock/responses_api.py b/chatmock/responses_api.py index 9aae843..51bda2a 100644 --- a/chatmock/responses_api.py +++ b/chatmock/responses_api.py @@ -88,6 +88,7 @@ def normalize_responses_payload( normalized = dict(payload) normalized["model"] = normalized_model + normalized.pop("max_output_tokens", None) if "input" in normalized: normalized["input"] = canonicalize_responses_input(normalized.get("input")) diff --git a/chatmock/routes_ollama.py b/chatmock/routes_ollama.py index 96c7c8b..5da18d0 100644 --- a/chatmock/routes_ollama.py +++ b/chatmock/routes_ollama.py @@ -250,7 +250,7 @@ def ollama_chat() -> Response: input_items = convert_chat_messages_to_responses_input(messages) model_reasoning = extract_reasoning_from_model_name(model) - normalized_model = normalize_model_name(model) + normalized_model = normalize_model_name(model, current_app.config.get("DEBUG_MODEL")) service_tier_resolution = resolve_service_tier( normalized_model, request_fast_mode=payload.get("fast_mode"), @@ -306,7 +306,7 @@ def ollama_chat() -> Response: base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req)) safe_choice = payload.get("tool_choice", "auto") upstream2, err2 = start_upstream_request( - normalize_model_name(model), + normalize_model_name(model, current_app.config.get("DEBUG_MODEL")), input_items, instructions=BASE_INSTRUCTIONS, tools=base_tools_only, @@ -570,7 +570,7 @@ def ollama_chat() -> Response: full_text = f"{rtxt}" + (full_text or "") out_json = { - "model": normalize_model_name(model), + "model": normalize_model_name(model, current_app.config.get("DEBUG_MODEL")), "created_at": created_at, "message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})}, "done": True, diff --git a/chatmock/routes_openai.py b/chatmock/routes_openai.py index 437ebef..eb37842 100644 --- a/chatmock/routes_openai.py +++ b/chatmock/routes_openai.py @@ -109,7 +109,6 @@ def chat_completions() -> Response: reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium") reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto") reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags") - debug_model = current_app.config.get("DEBUG_MODEL") raw = request.get_data(cache=True, as_text=True) or "" if verbose: @@ -129,7 +128,7 @@ def chat_completions() -> Response: return jsonify(err), 400 requested_model = payload.get("model") - model = normalize_model_name(requested_model, debug_model) + model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL")) messages = payload.get("messages") if messages is None and isinstance(payload.get("prompt"), str): messages = [{"role": "user", "content": payload.get("prompt") or ""}] @@ -413,7 +412,6 @@ def chat_completions() -> Response: def completions() -> Response: verbose = bool(current_app.config.get("VERBOSE")) verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION")) - debug_model = current_app.config.get("DEBUG_MODEL") reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium") reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto") @@ -432,7 +430,7 @@ def completions() -> Response: return jsonify(err), 400 requested_model = payload.get("model") - model = normalize_model_name(requested_model, debug_model) + model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL")) prompt = payload.get("prompt") if isinstance(prompt, list): prompt = "".join([p if isinstance(p, str) else "" for p in prompt]) diff --git a/gui.py b/gui.py index 82929fe..5bdc18c 100644 --- a/gui.py +++ b/gui.py @@ -19,6 +19,7 @@ def run_server( reasoning_summary: str = "auto", reasoning_compat: str = "think-tags", fast_mode: bool = False, + debug_model: str | None = None, expose_reasoning_models: bool = False, default_web_search: bool = False, ) -> None: @@ -27,10 +28,11 @@ def run_server( reasoning_summary=reasoning_summary, reasoning_compat=reasoning_compat, fast_mode=fast_mode, + debug_model=debug_model, expose_reasoning_models=expose_reasoning_models, default_web_search=default_web_search, ) - app.run(host=host, port=port, debug=False, use_reloader=False, threaded=True) + app.run(host=host, port=port, use_reloader=False, threaded=True) class ServerProcess(QtCore.QObject): @@ -45,6 +47,7 @@ class ServerProcess(QtCore.QObject): self._summary = "auto" self._compat = "think-tags" self._fast_mode = False + self._debug_model: str | None = None self._expose_reasoning_models = False self._default_web_search = False @@ -59,6 +62,7 @@ class ServerProcess(QtCore.QObject): summary: str, compat: str, fast_mode: bool, + debug_model: str | None, expose_reasoning_models: bool, default_web_search: bool, ) -> None: @@ -68,6 +72,7 @@ class ServerProcess(QtCore.QObject): self._effort, self._summary = effort, summary self._compat = compat self._fast_mode = fast_mode + self._debug_model = debug_model self._expose_reasoning_models = expose_reasoning_models self._default_web_search = default_web_search self._proc = QtCore.QProcess() @@ -80,6 +85,8 @@ class ServerProcess(QtCore.QObject): "--summary", summary, "--compat", compat, ] + if isinstance(debug_model, str) and debug_model.strip(): + args.extend(["--debug-model", debug_model.strip()]) if fast_mode: args.append("--fast-mode") if expose_reasoning_models: @@ -317,6 +324,12 @@ class MainWindow(QtWidgets.QMainWindow): self.port_edit.setValidator(QtGui.QIntValidator(1, 65535, self)) self.port_edit.setMaximumWidth(100) form.addWidget(self.port_edit, 0, 3) + form.addWidget(QtWidgets.QLabel("Debug Model"), 1, 0) + self.debug_model_edit = QtWidgets.QLineEdit("") + self.debug_model_edit.setClearButtonEnabled(True) + self.debug_model_edit.setPlaceholderText("Optional override, e.g. gpt-5.4") + self.debug_model_edit.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Fixed) + form.addWidget(self.debug_model_edit, 1, 1, 1, 3) form.setColumnStretch(1, 1) srv_layout.addLayout(form) @@ -473,6 +486,7 @@ class MainWindow(QtWidgets.QMainWindow): summary = self.summary.currentText().strip() compat = self.compat.currentText().strip() fast_mode = self.fast_mode.isChecked() + debug_model = self.debug_model_edit.text().strip() or None expose_reasoning_models = self.expose_reasoning_models.isChecked() default_web_search = self.enable_web_search.isChecked() self.status.setText(f"Starting server at http://{host}:{port} …") @@ -484,6 +498,7 @@ class MainWindow(QtWidgets.QMainWindow): summary, compat, fast_mode, + debug_model, expose_reasoning_models, default_web_search, ) @@ -536,6 +551,7 @@ def main() -> None: p.add_argument("--summary", default="auto") p.add_argument("--compat", default="think-tags") p.add_argument("--fast-mode", action="store_true") + p.add_argument("--debug-model") p.add_argument("--expose-reasoning-models", action="store_true") p.add_argument("--enable-web-search", action="store_true") args, _ = p.parse_known_args() @@ -546,6 +562,7 @@ def main() -> None: args.summary, args.compat, args.fast_mode, + args.debug_model, args.expose_reasoning_models, args.enable_web_search, ) diff --git a/scripts/test_responses_cached_tokens.py b/scripts/test_responses_cached_tokens.py deleted file mode 100644 index 9cf05f5..0000000 --- a/scripts/test_responses_cached_tokens.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -import sys -import uuid -from typing import Any, Dict - -import requests - - -def _post(url: str, api_key: str, session_id: str, payload: Dict[str, Any]) -> Dict[str, Any]: - response = requests.post( - url, - headers={ - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", - "X-Session-Id": session_id, - }, - json=payload, - timeout=180, - ) - try: - body = response.json() - except Exception: - body = {"raw": response.text} - if response.status_code >= 400: - raise RuntimeError( - f"POST {url} failed with {response.status_code}: {json.dumps(body, ensure_ascii=False)}" - ) - if not isinstance(body, dict): - raise RuntimeError(f"Expected JSON object response, got: {body!r}") - return body - - -def _usage_summary(body: Dict[str, Any]) -> Dict[str, Any]: - usage = body.get("usage") - if not isinstance(usage, dict): - return {} - return usage - - -def _cached_tokens(body: Dict[str, Any]) -> int | None: - usage = _usage_summary(body) - details = usage.get("input_tokens_details") - if not isinstance(details, dict): - return None - value = details.get("cached_tokens") - try: - return int(value) - except Exception: - return None - - -def _assistant_message_item(body: Dict[str, Any]) -> Dict[str, Any]: - output = body.get("output") - if not isinstance(output, list): - raise RuntimeError("Response did not include an output list.") - for item in output: - if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant": - return item - raise RuntimeError("Response did not include an assistant message item.") - - -def _user_message(text: str) -> Dict[str, Any]: - return { - "type": "message", - "role": "user", - "content": [{"type": "input_text", "text": text}], - } - - -def _default_prefix() -> str: - seed = "Cache test prefix. Repeat this context exactly for cache measurement. " - return "".join(seed for _ in range(220)) - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Drive two raw /v1/responses turns through ChatMock and check cached input tokens." - ) - parser.add_argument("--base-url", default="http://127.0.0.1:8000", help="ChatMock base URL.") - parser.add_argument("--api-key", default="key", help="Bearer token to send to ChatMock.") - parser.add_argument("--model", default="gpt-5.4", help="Model to request.") - parser.add_argument( - "--session-id", - default=f"cache-check-{uuid.uuid4()}", - help="Fixed X-Session-Id for both turns.", - ) - parser.add_argument( - "--prefix", - default=_default_prefix(), - help="Large repeated first-turn prompt prefix.", - ) - parser.add_argument( - "--first-question", - default="Reply with exactly: alpha", - help="Trailing instruction for the first turn.", - ) - parser.add_argument( - "--second-question", - default="Reply with exactly: beta", - help="Trailing instruction for the second turn.", - ) - args = parser.parse_args() - - responses_url = args.base_url.rstrip("/") + "/v1/responses" - session_id = args.session_id - first_text = f"{args.prefix}\n\n{args.first_question}" - second_text = args.second_question - - print(f"Using session id: {session_id}") - print(f"POST target: {responses_url}") - print("This checks the raw Responses usage object returned through ChatMock.") - print() - - first_payload = { - "model": args.model, - "store": False, - "stream": False, - "input": first_text, - } - first_response = _post(responses_url, args.api_key, session_id, first_payload) - assistant_item = _assistant_message_item(first_response) - - second_payload = { - "model": args.model, - "store": False, - "stream": False, - "input": [ - _user_message(first_text), - assistant_item, - _user_message(second_text), - ], - } - second_response = _post(responses_url, args.api_key, session_id, second_payload) - - first_usage = _usage_summary(first_response) - second_usage = _usage_summary(second_response) - first_cached = _cached_tokens(first_response) - second_cached = _cached_tokens(second_response) - - print("Turn 1") - print(json.dumps(first_usage, indent=2, ensure_ascii=False) if first_usage else " no usage object") - print() - print("Turn 2") - print(json.dumps(second_usage, indent=2, ensure_ascii=False) if second_usage else " no usage object") - print() - - if second_cached is None: - first_input_tokens = first_usage.get("input_tokens") if isinstance(first_usage, dict) else None - second_input_tokens = second_usage.get("input_tokens") if isinstance(second_usage, dict) else None - print("Result: inconclusive") - print("Reason: upstream did not include `usage.input_tokens_details.cached_tokens`.") - if isinstance(first_input_tokens, int) and isinstance(second_input_tokens, int): - print(f"Observed input_tokens delta: first={first_input_tokens}, second={second_input_tokens}") - print("Codex treats cached-token reporting as the direct cache-hit signal; without it, this script cannot prove caching.") - return 2 - - if second_cached > 0: - print(f"Result: success, follow-up turn reported cached_tokens={second_cached}.") - return 0 - - print("Result: failure, follow-up turn reported cached_tokens=0.") - return 1 - - -if __name__ == "__main__": - try: - raise SystemExit(main()) - except KeyboardInterrupt: - raise SystemExit(130) - except Exception as exc: - print(f"error: {exc}", file=sys.stderr) - raise SystemExit(1) diff --git a/scripts/test_responses_reuse.py b/scripts/test_responses_reuse.py deleted file mode 100644 index 5e506ab..0000000 --- a/scripts/test_responses_reuse.py +++ /dev/null @@ -1,143 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -import sys -import uuid -from typing import Any, Dict, Tuple - -from websockets.sync.client import connect - - -def _user_message(text: str) -> Dict[str, Any]: - return { - "type": "message", - "role": "user", - "content": [{"type": "input_text", "text": text}], - } - - -def _receive_turn(ws) -> Tuple[str, Dict[str, Any]]: - response_id: str | None = None - assistant_item: Dict[str, Any] | None = None - - while True: - raw = ws.recv(timeout=120) - event = json.loads(raw) - event_type = event.get("type") - if event_type == "error": - raise RuntimeError(f"websocket error: {json.dumps(event, ensure_ascii=False)}") - if event_type == "response.created": - response = event.get("response") - if isinstance(response, dict) and isinstance(response.get("id"), str): - response_id = response["id"] - elif event_type == "response.output_item.done": - item = event.get("item") - if ( - isinstance(item, dict) - and item.get("type") == "message" - and item.get("role") == "assistant" - ): - assistant_item = item - elif event_type == "response.completed": - if not response_id: - response = event.get("response") - if isinstance(response, dict) and isinstance(response.get("id"), str): - response_id = response["id"] - if not response_id: - raise RuntimeError("turn completed without a response id") - if assistant_item is None: - raise RuntimeError("turn completed without an assistant message item") - return response_id, assistant_item - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Exercise ChatMock websocket reuse the same way Codex does." - ) - parser.add_argument( - "--ws-url", - default="ws://127.0.0.1:8000/v1/responses", - help="ChatMock websocket URL.", - ) - parser.add_argument("--model", default="gpt-5.4", help="Model to request.") - parser.add_argument( - "--session-id", - default=f"reuse-demo-{uuid.uuid4()}", - help="Fixed X-Session-Id for the whole run.", - ) - parser.add_argument( - "--first-prompt", - default="Say exactly: alpha", - help="Prompt for the first turn.", - ) - parser.add_argument( - "--second-prompt", - default="Now say exactly: beta", - help="Prompt appended in the reuse-candidate turn.", - ) - parser.add_argument( - "--no-fast-mode", - action="store_true", - help="Do not send fast_mode=true.", - ) - args = parser.parse_args() - - headers = {"X-Session-Id": args.session_id} - fast_mode = not args.no_fast_mode - - print(f"Using websocket session id: {args.session_id}") - print(f"Connecting to: {args.ws_url}") - print("Run ChatMock with `python3 chatmock.py serve --verbose` in another terminal.") - print("This verifies the Codex-aligned path: websocket `response.create` reuse.") - print("HTTP `/v1/responses` is not expected to send `previous_response_id`.") - print() - - with connect(args.ws_url, additional_headers=headers, open_timeout=15) as ws: - first_request = { - "type": "response.create", - "model": args.model, - "store": False, - "input": args.first_prompt, - "fast_mode": fast_mode, - } - ws.send(json.dumps(first_request)) - first_response_id, assistant_item = _receive_turn(ws) - - second_request = { - "type": "response.create", - "model": args.model, - "store": False, - "input": [ - _user_message(args.first_prompt), - assistant_item, - _user_message(args.second_prompt), - ], - "fast_mode": fast_mode, - } - ws.send(json.dumps(second_request)) - second_response_id, _ = _receive_turn(ws) - - print("Turn 1 completed.") - print(f" response id: {first_response_id}") - print("Turn 2 completed.") - print(f" response id: {second_response_id}") - print() - print("Expected in the verbose ChatMock server log for turn 2:") - print(" - outbound websocket payload includes `previous_response_id`") - print(" - `previous_response_id` equals the first response id") - print(" - outbound `input` only contains the new trailing user message") - print() - print("If turn 2 still shows the full conversation in the outbound websocket payload, reuse is not working.") - return 0 - - -if __name__ == "__main__": - try: - raise SystemExit(main()) - except KeyboardInterrupt: - raise SystemExit(130) - except Exception as exc: - print(f"error: {exc}", file=sys.stderr) - raise SystemExit(1) diff --git a/tests/test_routes.py b/tests/test_routes.py index 1316bc8..c5d94bc 100644 --- a/tests/test_routes.py +++ b/tests/test_routes.py @@ -91,6 +91,26 @@ class RouteTests(unittest.TestCase): self.assertEqual(body["choices"][0]["message"]["content"], "hello") self.assertEqual(body["model"], "gpt5.4-mini") + @patch("chatmock.routes_openai.start_upstream_request") + def test_chat_completions_honors_debug_model_override(self, mock_start) -> None: + app = create_app(debug_model="gpt-5.4") + client = app.test_client() + mock_start.return_value = ( + FakeUpstream( + [ + {"type": "response.output_text.delta", "delta": "hello"}, + {"type": "response.completed", "response": {"id": "resp-openai"}}, + ] + ), + None, + ) + response = client.post( + "/v1/chat/completions", + json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}]}, + ) + self.assertEqual(response.status_code, 200) + self.assertEqual(mock_start.call_args.args[0], "gpt-5.4") + @patch("chatmock.routes_ollama.start_upstream_request") def test_ollama_chat(self, mock_start) -> None: mock_start.return_value = ( @@ -111,6 +131,28 @@ class RouteTests(unittest.TestCase): self.assertEqual(body["message"]["content"], "hello") self.assertEqual(body["model"], "gpt-5.4") + @patch("chatmock.routes_ollama.start_upstream_request") + def test_ollama_chat_honors_debug_model_override(self, mock_start) -> None: + app = create_app(debug_model="gpt-5.4") + client = app.test_client() + mock_start.return_value = ( + FakeUpstream( + [ + {"type": "response.output_text.delta", "delta": "hello"}, + {"type": "response.completed"}, + ] + ), + None, + ) + response = client.post( + "/api/chat", + json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}], "stream": False}, + ) + body = response.get_json() + self.assertEqual(response.status_code, 200) + self.assertEqual(mock_start.call_args.args[0], "gpt-5.4") + self.assertEqual(body["model"], "gpt-5.4") + @patch("chatmock.routes_openai.start_upstream_request") def test_chat_completions_fast_mode_sets_priority_service_tier(self, mock_start) -> None: mock_start.return_value = ( @@ -212,6 +254,70 @@ class RouteTests(unittest.TestCase): self.assertEqual(outbound_payload["reasoning"]["effort"], "medium") self.assertIsInstance(outbound_payload["prompt_cache_key"], str) + @patch("chatmock.routes_openai.start_upstream_raw_request") + def test_responses_route_honors_debug_model_override(self, mock_start) -> None: + app = create_app(debug_model="gpt-5.4") + client = app.test_client() + mock_start.return_value = ( + FakeUpstream( + [ + { + "type": "response.created", + "response": {"id": "resp_debug", "object": "response", "status": "in_progress"}, + }, + { + "type": "response.completed", + "response": { + "id": "resp_debug", + "object": "response", + "status": "completed", + "output": [], + }, + }, + ], + headers={"Content-Type": "text/event-stream"}, + ), + None, + ) + response = client.post( + "/v1/responses", + json={"model": "gpt-5.3-codex", "input": "hello"}, + ) + self.assertEqual(response.status_code, 200) + outbound_payload = mock_start.call_args.args[0] + self.assertEqual(outbound_payload["model"], "gpt-5.4") + + @patch("chatmock.routes_openai.start_upstream_raw_request") + def test_responses_route_strips_unsupported_max_output_tokens(self, mock_start) -> None: + mock_start.return_value = ( + FakeUpstream( + [ + { + "type": "response.created", + "response": {"id": "resp_limit", "object": "response", "status": "in_progress"}, + }, + { + "type": "response.completed", + "response": { + "id": "resp_limit", + "object": "response", + "status": "completed", + "output": [], + }, + }, + ], + headers={"Content-Type": "text/event-stream"}, + ), + None, + ) + response = self.client.post( + "/v1/responses", + json={"model": "gpt-5.4", "input": "hello", "max_output_tokens": 20}, + ) + self.assertEqual(response.status_code, 200) + outbound_payload = mock_start.call_args.args[0] + self.assertNotIn("max_output_tokens", outbound_payload) + @patch("chatmock.routes_openai.start_upstream_raw_request") def test_responses_route_does_not_use_previous_response_id_for_http_follow_up(self, mock_start) -> None: mock_start.side_effect = [ @@ -496,7 +602,6 @@ class RouteTests(unittest.TestCase): kwargs={ "host": host, "port": port, - "debug": False, "use_reloader": False, "threaded": True, },