fixes #103: responses api max_output_tokens bug
This commit is contained in:
@@ -25,13 +25,12 @@ Set options in `.env` or pass environment variables:
|
|||||||
- `CHATGPT_LOCAL_REASONING_SUMMARY`: auto|concise|detailed|none
|
- `CHATGPT_LOCAL_REASONING_SUMMARY`: auto|concise|detailed|none
|
||||||
- `CHATGPT_LOCAL_REASONING_COMPAT`: legacy|o3|think-tags|current
|
- `CHATGPT_LOCAL_REASONING_COMPAT`: legacy|o3|think-tags|current
|
||||||
- `CHATGPT_LOCAL_FAST_MODE`: `true|false` to enable fast mode by default for supported models
|
- `CHATGPT_LOCAL_FAST_MODE`: `true|false` to enable fast mode by default for supported models
|
||||||
- `CHATGPT_LOCAL_DEBUG_MODEL`: force model override (e.g., `gpt-5.4`)
|
|
||||||
- `CHATGPT_LOCAL_CLIENT_ID`: OAuth client id override (rarely needed)
|
- `CHATGPT_LOCAL_CLIENT_ID`: OAuth client id override (rarely needed)
|
||||||
- `CHATGPT_LOCAL_EXPOSE_REASONING_MODELS`: `true|false` to add reasoning model variants to `/v1/models`
|
- `CHATGPT_LOCAL_EXPOSE_REASONING_MODELS`: `true|false` to add reasoning model variants to `/v1/models`
|
||||||
- `CHATGPT_LOCAL_ENABLE_WEB_SEARCH`: `true|false` to enable default web search tool
|
- `CHATGPT_LOCAL_ENABLE_WEB_SEARCH`: `true|false` to enable default web search tool
|
||||||
|
|
||||||
## Logs
|
## Logs
|
||||||
Set `VERBOSE=true` to include extra logging for debugging issues in upstream or chat app requests. Please include and use these logs when submitting bug reports.
|
Set `VERBOSE=true` to include extra logging for troubleshooting upstream or chat app requests. Please include and use these logs when submitting bug reports.
|
||||||
|
|
||||||
## Test
|
## Test
|
||||||
|
|
||||||
|
|||||||
@@ -284,7 +284,7 @@ def cmd_serve(
|
|||||||
default_web_search=default_web_search,
|
default_web_search=default_web_search,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
|
app.run(host=host, use_reloader=False, port=port, threaded=True)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ def normalize_responses_payload(
|
|||||||
|
|
||||||
normalized = dict(payload)
|
normalized = dict(payload)
|
||||||
normalized["model"] = normalized_model
|
normalized["model"] = normalized_model
|
||||||
|
normalized.pop("max_output_tokens", None)
|
||||||
|
|
||||||
if "input" in normalized:
|
if "input" in normalized:
|
||||||
normalized["input"] = canonicalize_responses_input(normalized.get("input"))
|
normalized["input"] = canonicalize_responses_input(normalized.get("input"))
|
||||||
|
|||||||
@@ -250,7 +250,7 @@ def ollama_chat() -> Response:
|
|||||||
input_items = convert_chat_messages_to_responses_input(messages)
|
input_items = convert_chat_messages_to_responses_input(messages)
|
||||||
|
|
||||||
model_reasoning = extract_reasoning_from_model_name(model)
|
model_reasoning = extract_reasoning_from_model_name(model)
|
||||||
normalized_model = normalize_model_name(model)
|
normalized_model = normalize_model_name(model, current_app.config.get("DEBUG_MODEL"))
|
||||||
service_tier_resolution = resolve_service_tier(
|
service_tier_resolution = resolve_service_tier(
|
||||||
normalized_model,
|
normalized_model,
|
||||||
request_fast_mode=payload.get("fast_mode"),
|
request_fast_mode=payload.get("fast_mode"),
|
||||||
@@ -306,7 +306,7 @@ def ollama_chat() -> Response:
|
|||||||
base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
|
base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
|
||||||
safe_choice = payload.get("tool_choice", "auto")
|
safe_choice = payload.get("tool_choice", "auto")
|
||||||
upstream2, err2 = start_upstream_request(
|
upstream2, err2 = start_upstream_request(
|
||||||
normalize_model_name(model),
|
normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
|
||||||
input_items,
|
input_items,
|
||||||
instructions=BASE_INSTRUCTIONS,
|
instructions=BASE_INSTRUCTIONS,
|
||||||
tools=base_tools_only,
|
tools=base_tools_only,
|
||||||
@@ -570,7 +570,7 @@ def ollama_chat() -> Response:
|
|||||||
full_text = f"<think>{rtxt}</think>" + (full_text or "")
|
full_text = f"<think>{rtxt}</think>" + (full_text or "")
|
||||||
|
|
||||||
out_json = {
|
out_json = {
|
||||||
"model": normalize_model_name(model),
|
"model": normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
|
||||||
"created_at": created_at,
|
"created_at": created_at,
|
||||||
"message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
|
"message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
|
||||||
"done": True,
|
"done": True,
|
||||||
|
|||||||
@@ -109,7 +109,6 @@ def chat_completions() -> Response:
|
|||||||
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
||||||
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
||||||
reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
|
reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
|
||||||
debug_model = current_app.config.get("DEBUG_MODEL")
|
|
||||||
|
|
||||||
raw = request.get_data(cache=True, as_text=True) or ""
|
raw = request.get_data(cache=True, as_text=True) or ""
|
||||||
if verbose:
|
if verbose:
|
||||||
@@ -129,7 +128,7 @@ def chat_completions() -> Response:
|
|||||||
return jsonify(err), 400
|
return jsonify(err), 400
|
||||||
|
|
||||||
requested_model = payload.get("model")
|
requested_model = payload.get("model")
|
||||||
model = normalize_model_name(requested_model, debug_model)
|
model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
|
||||||
messages = payload.get("messages")
|
messages = payload.get("messages")
|
||||||
if messages is None and isinstance(payload.get("prompt"), str):
|
if messages is None and isinstance(payload.get("prompt"), str):
|
||||||
messages = [{"role": "user", "content": payload.get("prompt") or ""}]
|
messages = [{"role": "user", "content": payload.get("prompt") or ""}]
|
||||||
@@ -413,7 +412,6 @@ def chat_completions() -> Response:
|
|||||||
def completions() -> Response:
|
def completions() -> Response:
|
||||||
verbose = bool(current_app.config.get("VERBOSE"))
|
verbose = bool(current_app.config.get("VERBOSE"))
|
||||||
verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION"))
|
verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION"))
|
||||||
debug_model = current_app.config.get("DEBUG_MODEL")
|
|
||||||
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
||||||
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
||||||
|
|
||||||
@@ -432,7 +430,7 @@ def completions() -> Response:
|
|||||||
return jsonify(err), 400
|
return jsonify(err), 400
|
||||||
|
|
||||||
requested_model = payload.get("model")
|
requested_model = payload.get("model")
|
||||||
model = normalize_model_name(requested_model, debug_model)
|
model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
|
||||||
prompt = payload.get("prompt")
|
prompt = payload.get("prompt")
|
||||||
if isinstance(prompt, list):
|
if isinstance(prompt, list):
|
||||||
prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
|
prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
|
||||||
|
|||||||
19
gui.py
19
gui.py
@@ -19,6 +19,7 @@ def run_server(
|
|||||||
reasoning_summary: str = "auto",
|
reasoning_summary: str = "auto",
|
||||||
reasoning_compat: str = "think-tags",
|
reasoning_compat: str = "think-tags",
|
||||||
fast_mode: bool = False,
|
fast_mode: bool = False,
|
||||||
|
debug_model: str | None = None,
|
||||||
expose_reasoning_models: bool = False,
|
expose_reasoning_models: bool = False,
|
||||||
default_web_search: bool = False,
|
default_web_search: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -27,10 +28,11 @@ def run_server(
|
|||||||
reasoning_summary=reasoning_summary,
|
reasoning_summary=reasoning_summary,
|
||||||
reasoning_compat=reasoning_compat,
|
reasoning_compat=reasoning_compat,
|
||||||
fast_mode=fast_mode,
|
fast_mode=fast_mode,
|
||||||
|
debug_model=debug_model,
|
||||||
expose_reasoning_models=expose_reasoning_models,
|
expose_reasoning_models=expose_reasoning_models,
|
||||||
default_web_search=default_web_search,
|
default_web_search=default_web_search,
|
||||||
)
|
)
|
||||||
app.run(host=host, port=port, debug=False, use_reloader=False, threaded=True)
|
app.run(host=host, port=port, use_reloader=False, threaded=True)
|
||||||
|
|
||||||
|
|
||||||
class ServerProcess(QtCore.QObject):
|
class ServerProcess(QtCore.QObject):
|
||||||
@@ -45,6 +47,7 @@ class ServerProcess(QtCore.QObject):
|
|||||||
self._summary = "auto"
|
self._summary = "auto"
|
||||||
self._compat = "think-tags"
|
self._compat = "think-tags"
|
||||||
self._fast_mode = False
|
self._fast_mode = False
|
||||||
|
self._debug_model: str | None = None
|
||||||
self._expose_reasoning_models = False
|
self._expose_reasoning_models = False
|
||||||
self._default_web_search = False
|
self._default_web_search = False
|
||||||
|
|
||||||
@@ -59,6 +62,7 @@ class ServerProcess(QtCore.QObject):
|
|||||||
summary: str,
|
summary: str,
|
||||||
compat: str,
|
compat: str,
|
||||||
fast_mode: bool,
|
fast_mode: bool,
|
||||||
|
debug_model: str | None,
|
||||||
expose_reasoning_models: bool,
|
expose_reasoning_models: bool,
|
||||||
default_web_search: bool,
|
default_web_search: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -68,6 +72,7 @@ class ServerProcess(QtCore.QObject):
|
|||||||
self._effort, self._summary = effort, summary
|
self._effort, self._summary = effort, summary
|
||||||
self._compat = compat
|
self._compat = compat
|
||||||
self._fast_mode = fast_mode
|
self._fast_mode = fast_mode
|
||||||
|
self._debug_model = debug_model
|
||||||
self._expose_reasoning_models = expose_reasoning_models
|
self._expose_reasoning_models = expose_reasoning_models
|
||||||
self._default_web_search = default_web_search
|
self._default_web_search = default_web_search
|
||||||
self._proc = QtCore.QProcess()
|
self._proc = QtCore.QProcess()
|
||||||
@@ -80,6 +85,8 @@ class ServerProcess(QtCore.QObject):
|
|||||||
"--summary", summary,
|
"--summary", summary,
|
||||||
"--compat", compat,
|
"--compat", compat,
|
||||||
]
|
]
|
||||||
|
if isinstance(debug_model, str) and debug_model.strip():
|
||||||
|
args.extend(["--debug-model", debug_model.strip()])
|
||||||
if fast_mode:
|
if fast_mode:
|
||||||
args.append("--fast-mode")
|
args.append("--fast-mode")
|
||||||
if expose_reasoning_models:
|
if expose_reasoning_models:
|
||||||
@@ -317,6 +324,12 @@ class MainWindow(QtWidgets.QMainWindow):
|
|||||||
self.port_edit.setValidator(QtGui.QIntValidator(1, 65535, self))
|
self.port_edit.setValidator(QtGui.QIntValidator(1, 65535, self))
|
||||||
self.port_edit.setMaximumWidth(100)
|
self.port_edit.setMaximumWidth(100)
|
||||||
form.addWidget(self.port_edit, 0, 3)
|
form.addWidget(self.port_edit, 0, 3)
|
||||||
|
form.addWidget(QtWidgets.QLabel("Debug Model"), 1, 0)
|
||||||
|
self.debug_model_edit = QtWidgets.QLineEdit("")
|
||||||
|
self.debug_model_edit.setClearButtonEnabled(True)
|
||||||
|
self.debug_model_edit.setPlaceholderText("Optional override, e.g. gpt-5.4")
|
||||||
|
self.debug_model_edit.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Fixed)
|
||||||
|
form.addWidget(self.debug_model_edit, 1, 1, 1, 3)
|
||||||
form.setColumnStretch(1, 1)
|
form.setColumnStretch(1, 1)
|
||||||
srv_layout.addLayout(form)
|
srv_layout.addLayout(form)
|
||||||
|
|
||||||
@@ -473,6 +486,7 @@ class MainWindow(QtWidgets.QMainWindow):
|
|||||||
summary = self.summary.currentText().strip()
|
summary = self.summary.currentText().strip()
|
||||||
compat = self.compat.currentText().strip()
|
compat = self.compat.currentText().strip()
|
||||||
fast_mode = self.fast_mode.isChecked()
|
fast_mode = self.fast_mode.isChecked()
|
||||||
|
debug_model = self.debug_model_edit.text().strip() or None
|
||||||
expose_reasoning_models = self.expose_reasoning_models.isChecked()
|
expose_reasoning_models = self.expose_reasoning_models.isChecked()
|
||||||
default_web_search = self.enable_web_search.isChecked()
|
default_web_search = self.enable_web_search.isChecked()
|
||||||
self.status.setText(f"Starting server at http://{host}:{port} …")
|
self.status.setText(f"Starting server at http://{host}:{port} …")
|
||||||
@@ -484,6 +498,7 @@ class MainWindow(QtWidgets.QMainWindow):
|
|||||||
summary,
|
summary,
|
||||||
compat,
|
compat,
|
||||||
fast_mode,
|
fast_mode,
|
||||||
|
debug_model,
|
||||||
expose_reasoning_models,
|
expose_reasoning_models,
|
||||||
default_web_search,
|
default_web_search,
|
||||||
)
|
)
|
||||||
@@ -536,6 +551,7 @@ def main() -> None:
|
|||||||
p.add_argument("--summary", default="auto")
|
p.add_argument("--summary", default="auto")
|
||||||
p.add_argument("--compat", default="think-tags")
|
p.add_argument("--compat", default="think-tags")
|
||||||
p.add_argument("--fast-mode", action="store_true")
|
p.add_argument("--fast-mode", action="store_true")
|
||||||
|
p.add_argument("--debug-model")
|
||||||
p.add_argument("--expose-reasoning-models", action="store_true")
|
p.add_argument("--expose-reasoning-models", action="store_true")
|
||||||
p.add_argument("--enable-web-search", action="store_true")
|
p.add_argument("--enable-web-search", action="store_true")
|
||||||
args, _ = p.parse_known_args()
|
args, _ = p.parse_known_args()
|
||||||
@@ -546,6 +562,7 @@ def main() -> None:
|
|||||||
args.summary,
|
args.summary,
|
||||||
args.compat,
|
args.compat,
|
||||||
args.fast_mode,
|
args.fast_mode,
|
||||||
|
args.debug_model,
|
||||||
args.expose_reasoning_models,
|
args.expose_reasoning_models,
|
||||||
args.enable_web_search,
|
args.enable_web_search,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,176 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import uuid
|
|
||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
|
|
||||||
def _post(url: str, api_key: str, session_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
response = requests.post(
|
|
||||||
url,
|
|
||||||
headers={
|
|
||||||
"Authorization": f"Bearer {api_key}",
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
"X-Session-Id": session_id,
|
|
||||||
},
|
|
||||||
json=payload,
|
|
||||||
timeout=180,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
body = response.json()
|
|
||||||
except Exception:
|
|
||||||
body = {"raw": response.text}
|
|
||||||
if response.status_code >= 400:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"POST {url} failed with {response.status_code}: {json.dumps(body, ensure_ascii=False)}"
|
|
||||||
)
|
|
||||||
if not isinstance(body, dict):
|
|
||||||
raise RuntimeError(f"Expected JSON object response, got: {body!r}")
|
|
||||||
return body
|
|
||||||
|
|
||||||
|
|
||||||
def _usage_summary(body: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
usage = body.get("usage")
|
|
||||||
if not isinstance(usage, dict):
|
|
||||||
return {}
|
|
||||||
return usage
|
|
||||||
|
|
||||||
|
|
||||||
def _cached_tokens(body: Dict[str, Any]) -> int | None:
|
|
||||||
usage = _usage_summary(body)
|
|
||||||
details = usage.get("input_tokens_details")
|
|
||||||
if not isinstance(details, dict):
|
|
||||||
return None
|
|
||||||
value = details.get("cached_tokens")
|
|
||||||
try:
|
|
||||||
return int(value)
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _assistant_message_item(body: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
output = body.get("output")
|
|
||||||
if not isinstance(output, list):
|
|
||||||
raise RuntimeError("Response did not include an output list.")
|
|
||||||
for item in output:
|
|
||||||
if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant":
|
|
||||||
return item
|
|
||||||
raise RuntimeError("Response did not include an assistant message item.")
|
|
||||||
|
|
||||||
|
|
||||||
def _user_message(text: str) -> Dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"type": "message",
|
|
||||||
"role": "user",
|
|
||||||
"content": [{"type": "input_text", "text": text}],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _default_prefix() -> str:
|
|
||||||
seed = "Cache test prefix. Repeat this context exactly for cache measurement. "
|
|
||||||
return "".join(seed for _ in range(220))
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Drive two raw /v1/responses turns through ChatMock and check cached input tokens."
|
|
||||||
)
|
|
||||||
parser.add_argument("--base-url", default="http://127.0.0.1:8000", help="ChatMock base URL.")
|
|
||||||
parser.add_argument("--api-key", default="key", help="Bearer token to send to ChatMock.")
|
|
||||||
parser.add_argument("--model", default="gpt-5.4", help="Model to request.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--session-id",
|
|
||||||
default=f"cache-check-{uuid.uuid4()}",
|
|
||||||
help="Fixed X-Session-Id for both turns.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--prefix",
|
|
||||||
default=_default_prefix(),
|
|
||||||
help="Large repeated first-turn prompt prefix.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--first-question",
|
|
||||||
default="Reply with exactly: alpha",
|
|
||||||
help="Trailing instruction for the first turn.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--second-question",
|
|
||||||
default="Reply with exactly: beta",
|
|
||||||
help="Trailing instruction for the second turn.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
responses_url = args.base_url.rstrip("/") + "/v1/responses"
|
|
||||||
session_id = args.session_id
|
|
||||||
first_text = f"{args.prefix}\n\n{args.first_question}"
|
|
||||||
second_text = args.second_question
|
|
||||||
|
|
||||||
print(f"Using session id: {session_id}")
|
|
||||||
print(f"POST target: {responses_url}")
|
|
||||||
print("This checks the raw Responses usage object returned through ChatMock.")
|
|
||||||
print()
|
|
||||||
|
|
||||||
first_payload = {
|
|
||||||
"model": args.model,
|
|
||||||
"store": False,
|
|
||||||
"stream": False,
|
|
||||||
"input": first_text,
|
|
||||||
}
|
|
||||||
first_response = _post(responses_url, args.api_key, session_id, first_payload)
|
|
||||||
assistant_item = _assistant_message_item(first_response)
|
|
||||||
|
|
||||||
second_payload = {
|
|
||||||
"model": args.model,
|
|
||||||
"store": False,
|
|
||||||
"stream": False,
|
|
||||||
"input": [
|
|
||||||
_user_message(first_text),
|
|
||||||
assistant_item,
|
|
||||||
_user_message(second_text),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
second_response = _post(responses_url, args.api_key, session_id, second_payload)
|
|
||||||
|
|
||||||
first_usage = _usage_summary(first_response)
|
|
||||||
second_usage = _usage_summary(second_response)
|
|
||||||
first_cached = _cached_tokens(first_response)
|
|
||||||
second_cached = _cached_tokens(second_response)
|
|
||||||
|
|
||||||
print("Turn 1")
|
|
||||||
print(json.dumps(first_usage, indent=2, ensure_ascii=False) if first_usage else " no usage object")
|
|
||||||
print()
|
|
||||||
print("Turn 2")
|
|
||||||
print(json.dumps(second_usage, indent=2, ensure_ascii=False) if second_usage else " no usage object")
|
|
||||||
print()
|
|
||||||
|
|
||||||
if second_cached is None:
|
|
||||||
first_input_tokens = first_usage.get("input_tokens") if isinstance(first_usage, dict) else None
|
|
||||||
second_input_tokens = second_usage.get("input_tokens") if isinstance(second_usage, dict) else None
|
|
||||||
print("Result: inconclusive")
|
|
||||||
print("Reason: upstream did not include `usage.input_tokens_details.cached_tokens`.")
|
|
||||||
if isinstance(first_input_tokens, int) and isinstance(second_input_tokens, int):
|
|
||||||
print(f"Observed input_tokens delta: first={first_input_tokens}, second={second_input_tokens}")
|
|
||||||
print("Codex treats cached-token reporting as the direct cache-hit signal; without it, this script cannot prove caching.")
|
|
||||||
return 2
|
|
||||||
|
|
||||||
if second_cached > 0:
|
|
||||||
print(f"Result: success, follow-up turn reported cached_tokens={second_cached}.")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
print("Result: failure, follow-up turn reported cached_tokens=0.")
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
try:
|
|
||||||
raise SystemExit(main())
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
raise SystemExit(130)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"error: {exc}", file=sys.stderr)
|
|
||||||
raise SystemExit(1)
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import uuid
|
|
||||||
from typing import Any, Dict, Tuple
|
|
||||||
|
|
||||||
from websockets.sync.client import connect
|
|
||||||
|
|
||||||
|
|
||||||
def _user_message(text: str) -> Dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"type": "message",
|
|
||||||
"role": "user",
|
|
||||||
"content": [{"type": "input_text", "text": text}],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _receive_turn(ws) -> Tuple[str, Dict[str, Any]]:
|
|
||||||
response_id: str | None = None
|
|
||||||
assistant_item: Dict[str, Any] | None = None
|
|
||||||
|
|
||||||
while True:
|
|
||||||
raw = ws.recv(timeout=120)
|
|
||||||
event = json.loads(raw)
|
|
||||||
event_type = event.get("type")
|
|
||||||
if event_type == "error":
|
|
||||||
raise RuntimeError(f"websocket error: {json.dumps(event, ensure_ascii=False)}")
|
|
||||||
if event_type == "response.created":
|
|
||||||
response = event.get("response")
|
|
||||||
if isinstance(response, dict) and isinstance(response.get("id"), str):
|
|
||||||
response_id = response["id"]
|
|
||||||
elif event_type == "response.output_item.done":
|
|
||||||
item = event.get("item")
|
|
||||||
if (
|
|
||||||
isinstance(item, dict)
|
|
||||||
and item.get("type") == "message"
|
|
||||||
and item.get("role") == "assistant"
|
|
||||||
):
|
|
||||||
assistant_item = item
|
|
||||||
elif event_type == "response.completed":
|
|
||||||
if not response_id:
|
|
||||||
response = event.get("response")
|
|
||||||
if isinstance(response, dict) and isinstance(response.get("id"), str):
|
|
||||||
response_id = response["id"]
|
|
||||||
if not response_id:
|
|
||||||
raise RuntimeError("turn completed without a response id")
|
|
||||||
if assistant_item is None:
|
|
||||||
raise RuntimeError("turn completed without an assistant message item")
|
|
||||||
return response_id, assistant_item
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Exercise ChatMock websocket reuse the same way Codex does."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--ws-url",
|
|
||||||
default="ws://127.0.0.1:8000/v1/responses",
|
|
||||||
help="ChatMock websocket URL.",
|
|
||||||
)
|
|
||||||
parser.add_argument("--model", default="gpt-5.4", help="Model to request.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--session-id",
|
|
||||||
default=f"reuse-demo-{uuid.uuid4()}",
|
|
||||||
help="Fixed X-Session-Id for the whole run.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--first-prompt",
|
|
||||||
default="Say exactly: alpha",
|
|
||||||
help="Prompt for the first turn.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--second-prompt",
|
|
||||||
default="Now say exactly: beta",
|
|
||||||
help="Prompt appended in the reuse-candidate turn.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-fast-mode",
|
|
||||||
action="store_true",
|
|
||||||
help="Do not send fast_mode=true.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
headers = {"X-Session-Id": args.session_id}
|
|
||||||
fast_mode = not args.no_fast_mode
|
|
||||||
|
|
||||||
print(f"Using websocket session id: {args.session_id}")
|
|
||||||
print(f"Connecting to: {args.ws_url}")
|
|
||||||
print("Run ChatMock with `python3 chatmock.py serve --verbose` in another terminal.")
|
|
||||||
print("This verifies the Codex-aligned path: websocket `response.create` reuse.")
|
|
||||||
print("HTTP `/v1/responses` is not expected to send `previous_response_id`.")
|
|
||||||
print()
|
|
||||||
|
|
||||||
with connect(args.ws_url, additional_headers=headers, open_timeout=15) as ws:
|
|
||||||
first_request = {
|
|
||||||
"type": "response.create",
|
|
||||||
"model": args.model,
|
|
||||||
"store": False,
|
|
||||||
"input": args.first_prompt,
|
|
||||||
"fast_mode": fast_mode,
|
|
||||||
}
|
|
||||||
ws.send(json.dumps(first_request))
|
|
||||||
first_response_id, assistant_item = _receive_turn(ws)
|
|
||||||
|
|
||||||
second_request = {
|
|
||||||
"type": "response.create",
|
|
||||||
"model": args.model,
|
|
||||||
"store": False,
|
|
||||||
"input": [
|
|
||||||
_user_message(args.first_prompt),
|
|
||||||
assistant_item,
|
|
||||||
_user_message(args.second_prompt),
|
|
||||||
],
|
|
||||||
"fast_mode": fast_mode,
|
|
||||||
}
|
|
||||||
ws.send(json.dumps(second_request))
|
|
||||||
second_response_id, _ = _receive_turn(ws)
|
|
||||||
|
|
||||||
print("Turn 1 completed.")
|
|
||||||
print(f" response id: {first_response_id}")
|
|
||||||
print("Turn 2 completed.")
|
|
||||||
print(f" response id: {second_response_id}")
|
|
||||||
print()
|
|
||||||
print("Expected in the verbose ChatMock server log for turn 2:")
|
|
||||||
print(" - outbound websocket payload includes `previous_response_id`")
|
|
||||||
print(" - `previous_response_id` equals the first response id")
|
|
||||||
print(" - outbound `input` only contains the new trailing user message")
|
|
||||||
print()
|
|
||||||
print("If turn 2 still shows the full conversation in the outbound websocket payload, reuse is not working.")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
try:
|
|
||||||
raise SystemExit(main())
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
raise SystemExit(130)
|
|
||||||
except Exception as exc:
|
|
||||||
print(f"error: {exc}", file=sys.stderr)
|
|
||||||
raise SystemExit(1)
|
|
||||||
@@ -91,6 +91,26 @@ class RouteTests(unittest.TestCase):
|
|||||||
self.assertEqual(body["choices"][0]["message"]["content"], "hello")
|
self.assertEqual(body["choices"][0]["message"]["content"], "hello")
|
||||||
self.assertEqual(body["model"], "gpt5.4-mini")
|
self.assertEqual(body["model"], "gpt5.4-mini")
|
||||||
|
|
||||||
|
@patch("chatmock.routes_openai.start_upstream_request")
|
||||||
|
def test_chat_completions_honors_debug_model_override(self, mock_start) -> None:
|
||||||
|
app = create_app(debug_model="gpt-5.4")
|
||||||
|
client = app.test_client()
|
||||||
|
mock_start.return_value = (
|
||||||
|
FakeUpstream(
|
||||||
|
[
|
||||||
|
{"type": "response.output_text.delta", "delta": "hello"},
|
||||||
|
{"type": "response.completed", "response": {"id": "resp-openai"}},
|
||||||
|
]
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
response = client.post(
|
||||||
|
"/v1/chat/completions",
|
||||||
|
json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}]},
|
||||||
|
)
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(mock_start.call_args.args[0], "gpt-5.4")
|
||||||
|
|
||||||
@patch("chatmock.routes_ollama.start_upstream_request")
|
@patch("chatmock.routes_ollama.start_upstream_request")
|
||||||
def test_ollama_chat(self, mock_start) -> None:
|
def test_ollama_chat(self, mock_start) -> None:
|
||||||
mock_start.return_value = (
|
mock_start.return_value = (
|
||||||
@@ -111,6 +131,28 @@ class RouteTests(unittest.TestCase):
|
|||||||
self.assertEqual(body["message"]["content"], "hello")
|
self.assertEqual(body["message"]["content"], "hello")
|
||||||
self.assertEqual(body["model"], "gpt-5.4")
|
self.assertEqual(body["model"], "gpt-5.4")
|
||||||
|
|
||||||
|
@patch("chatmock.routes_ollama.start_upstream_request")
|
||||||
|
def test_ollama_chat_honors_debug_model_override(self, mock_start) -> None:
|
||||||
|
app = create_app(debug_model="gpt-5.4")
|
||||||
|
client = app.test_client()
|
||||||
|
mock_start.return_value = (
|
||||||
|
FakeUpstream(
|
||||||
|
[
|
||||||
|
{"type": "response.output_text.delta", "delta": "hello"},
|
||||||
|
{"type": "response.completed"},
|
||||||
|
]
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
response = client.post(
|
||||||
|
"/api/chat",
|
||||||
|
json={"model": "gpt-5.3-codex", "messages": [{"role": "user", "content": "hi"}], "stream": False},
|
||||||
|
)
|
||||||
|
body = response.get_json()
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
self.assertEqual(mock_start.call_args.args[0], "gpt-5.4")
|
||||||
|
self.assertEqual(body["model"], "gpt-5.4")
|
||||||
|
|
||||||
@patch("chatmock.routes_openai.start_upstream_request")
|
@patch("chatmock.routes_openai.start_upstream_request")
|
||||||
def test_chat_completions_fast_mode_sets_priority_service_tier(self, mock_start) -> None:
|
def test_chat_completions_fast_mode_sets_priority_service_tier(self, mock_start) -> None:
|
||||||
mock_start.return_value = (
|
mock_start.return_value = (
|
||||||
@@ -212,6 +254,70 @@ class RouteTests(unittest.TestCase):
|
|||||||
self.assertEqual(outbound_payload["reasoning"]["effort"], "medium")
|
self.assertEqual(outbound_payload["reasoning"]["effort"], "medium")
|
||||||
self.assertIsInstance(outbound_payload["prompt_cache_key"], str)
|
self.assertIsInstance(outbound_payload["prompt_cache_key"], str)
|
||||||
|
|
||||||
|
@patch("chatmock.routes_openai.start_upstream_raw_request")
|
||||||
|
def test_responses_route_honors_debug_model_override(self, mock_start) -> None:
|
||||||
|
app = create_app(debug_model="gpt-5.4")
|
||||||
|
client = app.test_client()
|
||||||
|
mock_start.return_value = (
|
||||||
|
FakeUpstream(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "response.created",
|
||||||
|
"response": {"id": "resp_debug", "object": "response", "status": "in_progress"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "response.completed",
|
||||||
|
"response": {
|
||||||
|
"id": "resp_debug",
|
||||||
|
"object": "response",
|
||||||
|
"status": "completed",
|
||||||
|
"output": [],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
headers={"Content-Type": "text/event-stream"},
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
response = client.post(
|
||||||
|
"/v1/responses",
|
||||||
|
json={"model": "gpt-5.3-codex", "input": "hello"},
|
||||||
|
)
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
outbound_payload = mock_start.call_args.args[0]
|
||||||
|
self.assertEqual(outbound_payload["model"], "gpt-5.4")
|
||||||
|
|
||||||
|
@patch("chatmock.routes_openai.start_upstream_raw_request")
|
||||||
|
def test_responses_route_strips_unsupported_max_output_tokens(self, mock_start) -> None:
|
||||||
|
mock_start.return_value = (
|
||||||
|
FakeUpstream(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "response.created",
|
||||||
|
"response": {"id": "resp_limit", "object": "response", "status": "in_progress"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "response.completed",
|
||||||
|
"response": {
|
||||||
|
"id": "resp_limit",
|
||||||
|
"object": "response",
|
||||||
|
"status": "completed",
|
||||||
|
"output": [],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
headers={"Content-Type": "text/event-stream"},
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
response = self.client.post(
|
||||||
|
"/v1/responses",
|
||||||
|
json={"model": "gpt-5.4", "input": "hello", "max_output_tokens": 20},
|
||||||
|
)
|
||||||
|
self.assertEqual(response.status_code, 200)
|
||||||
|
outbound_payload = mock_start.call_args.args[0]
|
||||||
|
self.assertNotIn("max_output_tokens", outbound_payload)
|
||||||
|
|
||||||
@patch("chatmock.routes_openai.start_upstream_raw_request")
|
@patch("chatmock.routes_openai.start_upstream_raw_request")
|
||||||
def test_responses_route_does_not_use_previous_response_id_for_http_follow_up(self, mock_start) -> None:
|
def test_responses_route_does_not_use_previous_response_id_for_http_follow_up(self, mock_start) -> None:
|
||||||
mock_start.side_effect = [
|
mock_start.side_effect = [
|
||||||
@@ -496,7 +602,6 @@ class RouteTests(unittest.TestCase):
|
|||||||
kwargs={
|
kwargs={
|
||||||
"host": host,
|
"host": host,
|
||||||
"port": port,
|
"port": port,
|
||||||
"debug": False,
|
|
||||||
"use_reloader": False,
|
"use_reloader": False,
|
||||||
"threaded": True,
|
"threaded": True,
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user