feat: add responses api, websocket support, and fast mode

2026-03-23 15:41:42 +05:00
parent e96db19538
commit 8754203ec6
22 changed files with 2148 additions and 119 deletions
--- a/scripts/test_responses_cached_tokens.py
+++ b/scripts/test_responses_cached_tokens.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import uuid
+from typing import Any, Dict
+
+import requests
+
+
+def _post(url: str, api_key: str, session_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+    """POST `payload` as JSON to `url` (bearer auth, `X-Session-Id`, 180 s timeout).
+
+    Returns the decoded JSON object; a non-JSON body is returned as `{"raw": <text>}`.
+    Raises RuntimeError for an HTTP status >= 400 or a JSON body that is not an object.
+    """
+    request_headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+        "X-Session-Id": session_id,
+    }
+    reply = requests.post(url, headers=request_headers, json=payload, timeout=180)
+    try:
+        decoded = reply.json()
+    except Exception:  # body is not JSON: wrap the raw text instead
+        decoded = {"raw": reply.text}
+    if reply.status_code >= 400:
+        details = json.dumps(decoded, ensure_ascii=False)
+        raise RuntimeError(f"POST {url} failed with {reply.status_code}: {details}")
+    if isinstance(decoded, dict):
+        return decoded
+    raise RuntimeError(f"Expected JSON object response, got: {decoded!r}")
+
+
+def _usage_summary(body: Dict[str, Any]) -> Dict[str, Any]:
+    """Return `body["usage"]` when it is a dict; otherwise return an empty dict."""
+    reported = body.get("usage")
+    # A missing or malformed usage value is normalised to "nothing reported".
+    return reported if isinstance(reported, dict) else {}
+
+
+def _cached_tokens(body: Dict[str, Any]) -> int | None:
+    """Return `usage.input_tokens_details.cached_tokens` as an int, or None if unavailable."""
+    reported = body.get("usage")
+    token_details = reported.get("input_tokens_details") if isinstance(reported, dict) else None
+    if not isinstance(token_details, dict):
+        return None
+    try:
+        return int(token_details.get("cached_tokens"))
+    except Exception:  # value is missing (None) or cannot be converted to an int
+        return None
+
+
+def _assistant_message_item(body: Dict[str, Any]) -> Dict[str, Any]:
+    """Return the first assistant message in `body["output"]`, else raise RuntimeError."""
+    if not isinstance(body.get("output"), list):
+        raise RuntimeError("Response did not include an output list.")
+    for entry in body["output"]:
+        if isinstance(entry, dict) and (entry.get("type"), entry.get("role")) == ("message", "assistant"):
+            return entry
+    raise RuntimeError("Response did not include an assistant message item.")
+
+
+def _user_message(text: str) -> Dict[str, Any]:
+    """Wrap `text` in a user `message` item holding a single `input_text` content part."""
+    text_part = {"type": "input_text", "text": text}
+    item: Dict[str, Any] = {"type": "message", "role": "user"}
+    item["content"] = [text_part]
+    return item
+
+
+def _default_prefix() -> str:
+    """Return the default first-turn prefix: one fixed sentence repeated 220 times."""
+    return "Cache test prefix. Repeat this context exactly for cache measurement. " * 220
+
+
+def main() -> int:
+    """Run two `/v1/responses` turns in one session and report turn 2's cached input tokens.
+
+    Turn 1 sends the prefix plus the first question as a plain string. Turn 2 replays that
+    prompt and the assistant's reply as message items, then appends the second question.
+    Both requests carry the same `X-Session-Id`.
+
+    Returns the exit status: 0 if turn 2 reports `cached_tokens` > 0, 1 if it reports a
+    non-positive count, and 2 (inconclusive) if no usable `cached_tokens` value came back.
+    """
+    parser = argparse.ArgumentParser(
+        description="Drive two raw /v1/responses turns through ChatMock and check cached input tokens."
+    )
+    # (flag, default, help) triples, registered in this order.
+    option_specs = [
+        ("--base-url", "http://127.0.0.1:8000", "ChatMock base URL."),
+        ("--api-key", "key", "Bearer token to send to ChatMock."),
+        ("--model", "gpt-5.4", "Model to request."),
+        ("--session-id", f"cache-check-{uuid.uuid4()}", "Fixed X-Session-Id for both turns."),
+        ("--prefix", _default_prefix(), "Large repeated first-turn prompt prefix."),
+        ("--first-question", "Reply with exactly: alpha", "Trailing instruction for the first turn."),
+        ("--second-question", "Reply with exactly: beta", "Trailing instruction for the second turn."),
+    ]
+    for flag, default, help_text in option_specs:
+        parser.add_argument(flag, default=default, help=help_text)
+    args = parser.parse_args()
+
+    # Tolerate a trailing slash on --base-url.
+    responses_url = args.base_url.rstrip("/") + "/v1/responses"
+    opening_text = f"{args.prefix}\n\n{args.first_question}"
+
+    def send(turn_input: Any) -> Dict[str, Any]:
+        """POST one non-streaming, non-stored request whose `input` is `turn_input`."""
+        payload = {
+            "model": args.model,
+            "store": False,
+            "stream": False,
+            "input": turn_input,
+        }
+        return _post(responses_url, args.api_key, args.session_id, payload)
+
+    # Echo the run parameters before any request is sent.
+    banner = (
+        f"Using session id: {args.session_id}",
+        f"POST target: {responses_url}",
+        "This checks the raw Responses usage object returned through ChatMock.",
+        "",
+    )
+    for line in banner:
+        print(line)
+
+    first_response = send(opening_text)
+    # Turn 2 replays turn 1 (prompt plus the assistant's reply) and appends the new question.
+    replayed_history = [
+        _user_message(opening_text),
+        _assistant_message_item(first_response),
+        _user_message(args.second_question),
+    ]
+    second_response = send(replayed_history)
+
+    # Summarise both turns; each `usages` entry is a dict, possibly empty.
+    replies = (first_response, second_response)
+    usages = [_usage_summary(reply) for reply in replies]
+    cached_counts = [_cached_tokens(reply) for reply in replies]
+    for label, usage in zip(("Turn 1", "Turn 2"), usages):
+        print(label)
+        print(json.dumps(usage, indent=2, ensure_ascii=False) if usage else "  no usage object")
+        print()
+
+    # Only the follow-up turn's count decides the outcome.
+    second_cached = cached_counts[-1]
+    if second_cached is None:
+        first_input_tokens = usages[0].get("input_tokens")
+        second_input_tokens = usages[1].get("input_tokens")
+        print("Result: inconclusive")
+        print("Reason: upstream did not include `usage.input_tokens_details.cached_tokens`.")
+        if isinstance(first_input_tokens, int) and isinstance(second_input_tokens, int):
+            print(f"Observed input_tokens delta: first={first_input_tokens}, second={second_input_tokens}")
+        print("Codex treats cached-token reporting as the direct cache-hit signal; without it, this script cannot prove caching.")
+        return 2
+
+    # A count was reported: anything above zero is treated as a cache hit.
+    if second_cached <= 0:
+        print("Result: failure, follow-up turn reported cached_tokens=0.")
+        return 1
+
+    print(f"Result: success, follow-up turn reported cached_tokens={second_cached}.")
+    return 0
+
+
+if __name__ == "__main__":
+    try:  # exit status: main()'s return value, 130 on Ctrl-C, 1 on any other error
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(130)
+    except Exception as failure:
+        print(f"error: {failure}", file=sys.stderr)
+        sys.exit(1)
--- a/scripts/test_responses_reuse.py
+++ b/scripts/test_responses_reuse.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import uuid
+from typing import Any, Dict, Tuple
+
+from websockets.sync.client import connect
+
+
+def _user_message(text: str) -> Dict[str, Any]:
+    """Build a user `message` item whose content is one `input_text` part holding `text`."""
+    # The content list always has exactly one part.
+    content = [{"type": "input_text", "text": text}]
+    # Keys stay in the order type, role, content so the serialized JSON is unchanged.
+    return {"type": "message", "role": "user", "content": content}
+
+
+def _receive_turn(ws) -> Tuple[str, Dict[str, Any]]:
+    """Read events until `response.completed`; return `(response_id, assistant_item)`.
+
+    The id is taken from `response.created`, else from `response.completed`. Raises RuntimeError
+    on `error`, `response.failed` or `response.incomplete` events, or on a turn missing an id or reply.
+    """
+    response_id: str | None = None
+    assistant_item: Dict[str, Any] | None = None
+    while True:
+        event = json.loads(ws.recv(timeout=120))
+        event_type = event.get("type")
+        if event_type == "error":
+            raise RuntimeError(f"websocket error: {json.dumps(event, ensure_ascii=False)}")
+        if event_type in ("response.failed", "response.incomplete"):
+            # These end a turn without `response.completed`; fail now instead of at the recv() timeout.
+            raise RuntimeError(f"turn ended with {event_type}: {json.dumps(event, ensure_ascii=False)}")
+        response = event.get("response")
+        event_id = response.get("id") if isinstance(response, dict) else None
+        if event_type == "response.created" and isinstance(event_id, str):
+            response_id = event_id
+        elif event_type == "response.output_item.done":
+            item = event.get("item")
+            if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant":
+                assistant_item = item
+        elif event_type == "response.completed":
+            if not response_id and isinstance(event_id, str):
+                response_id = event_id  # no usable id from `response.created`
+            if not response_id:
+                raise RuntimeError("turn completed without a response id")
+            if assistant_item is None:
+                raise RuntimeError("turn completed without an assistant message item")
+            return response_id, assistant_item
+
+
+def main() -> int:
+    """Send two `response.create` turns over one websocket and print both response ids.
+
+    Turn 2 replays the first prompt and the assistant's reply as message items, then appends
+    the second prompt. The script does not detect reuse itself: it prints what to look for
+    in the verbose server log. Returns 0 once both turns have completed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Exercise ChatMock websocket reuse the same way Codex does."
+    )
+    # (flag, default, help) triples for the string options, registered in this order.
+    string_options = [
+        ("--ws-url", "ws://127.0.0.1:8000/v1/responses", "ChatMock websocket URL."),
+        ("--model", "gpt-5.4", "Model to request."),
+        ("--session-id", f"reuse-demo-{uuid.uuid4()}", "Fixed X-Session-Id for the whole run."),
+        ("--first-prompt", "Say exactly: alpha", "Prompt for the first turn."),
+        ("--second-prompt", "Now say exactly: beta", "Prompt appended in the reuse-candidate turn."),
+    ]
+    for flag, default, help_text in string_options:
+        parser.add_argument(flag, default=default, help=help_text)
+    parser.add_argument("--no-fast-mode", action="store_true", help="Do not send fast_mode=true.")
+    args = parser.parse_args()
+
+    def create_message(turn_input: Any) -> str:
+        """Serialize a `response.create` request whose `input` is `turn_input`."""
+        request = {
+            "type": "response.create",
+            "model": args.model,
+            "store": False,
+            "input": turn_input,
+            "fast_mode": not args.no_fast_mode,
+        }
+        return json.dumps(request)
+
+    # Print the run parameters and setup hints before connecting.
+    preamble = (
+        f"Using websocket session id: {args.session_id}",
+        f"Connecting to: {args.ws_url}",
+        "Run ChatMock with `python3 chatmock.py serve --verbose` in another terminal.",
+        "This verifies the Codex-aligned path: websocket `response.create` reuse.",
+        "HTTP `/v1/responses` is not expected to send `previous_response_id`.",
+        "",
+    )
+    for line in preamble:
+        print(line)
+
+    # One connection carries both turns; the session id is passed via `additional_headers`.
+    session_headers = {"X-Session-Id": args.session_id}
+    with connect(args.ws_url, additional_headers=session_headers, open_timeout=15) as ws:
+        ws.send(create_message(args.first_prompt))
+        first_response_id, assistant_item = _receive_turn(ws)
+
+        # Same socket, full history plus the new prompt: the reuse-candidate turn.
+        replayed_history = [
+            _user_message(args.first_prompt),
+            assistant_item,
+            _user_message(args.second_prompt),
+        ]
+        ws.send(create_message(replayed_history))
+        second_response_id, _ = _receive_turn(ws)
+
+    # Both ids, followed by what to look for in the server log.
+    report = (
+        "Turn 1 completed.",
+        f"  response id: {first_response_id}",
+        "Turn 2 completed.",
+        f"  response id: {second_response_id}",
+        "",
+        "Expected in the verbose ChatMock server log for turn 2:",
+        "  - outbound websocket payload includes `previous_response_id`",
+        "  - `previous_response_id` equals the first response id",
+        "  - outbound `input` only contains the new trailing user message",
+        "",
+        "If turn 2 still shows the full conversation in the outbound websocket payload, reuse is not working.",
+    )
+    for line in report:
+        print(line)
+
+    return 0
+
+
+if __name__ == "__main__":
+    try:  # exit status: main()'s return value, 130 on Ctrl-C, 1 on any other error
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(130)
+    except Exception as failure:
+        print(f"error: {failure}", file=sys.stderr)
+        sys.exit(1)