feat: add responses api, websocket support, and fast mode
This commit is contained in:
176
scripts/test_responses_cached_tokens.py
Normal file
176
scripts/test_responses_cached_tokens.py
Normal file
@@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import uuid
|
||||
from typing import Any, Dict
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def _post(url: str, api_key: str, session_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Send ``payload`` as a JSON POST to ``url`` and return the decoded JSON object.

    The request carries a bearer ``Authorization`` header built from ``api_key``
    and an ``X-Session-Id`` header set to ``session_id``; it times out after
    180 seconds.

    Raises:
        RuntimeError: if the status code is 400 or higher, or if the decoded
            body is not a JSON object.
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "X-Session-Id": session_id,
    }
    reply = requests.post(url, headers=request_headers, json=payload, timeout=180)

    # Keep the raw text when the body is not JSON so an error report can still
    # show what came back.
    try:
        decoded = reply.json()
    except Exception:
        decoded = {"raw": reply.text}

    status = reply.status_code
    if status >= 400:
        detail = json.dumps(decoded, ensure_ascii=False)
        raise RuntimeError(f"POST {url} failed with {status}: {detail}")

    if isinstance(decoded, dict):
        return decoded
    raise RuntimeError(f"Expected JSON object response, got: {decoded!r}")
|
||||
|
||||
|
||||
def _usage_summary(body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
usage = body.get("usage")
|
||||
if not isinstance(usage, dict):
|
||||
return {}
|
||||
return usage
|
||||
|
||||
|
||||
def _cached_tokens(body: Dict[str, Any]) -> int | None:
    """Return ``usage.input_tokens_details.cached_tokens`` from ``body`` as an int.

    Returns ``None`` when ``input_tokens_details`` is not a dict or when the
    ``cached_tokens`` value cannot be converted with ``int()``.
    """
    token_details = _usage_summary(body).get("input_tokens_details")
    if isinstance(token_details, dict):
        raw_count = token_details.get("cached_tokens")
        try:
            return int(raw_count)
        except Exception:
            return None
    return None
|
||||
|
||||
|
||||
def _assistant_message_item(body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
output = body.get("output")
|
||||
if not isinstance(output, list):
|
||||
raise RuntimeError("Response did not include an output list.")
|
||||
for item in output:
|
||||
if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant":
|
||||
return item
|
||||
raise RuntimeError("Response did not include an assistant message item.")
|
||||
|
||||
|
||||
def _user_message(text: str) -> Dict[str, Any]:
|
||||
return {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": text}],
|
||||
}
|
||||
|
||||
|
||||
def _default_prefix() -> str:
|
||||
seed = "Cache test prefix. Repeat this context exactly for cache measurement. "
|
||||
return "".join(seed for _ in range(220))
|
||||
|
||||
|
||||
def main() -> int:
    """Run two ``/v1/responses`` turns and report the follow-up turn's cached tokens.

    The first turn sends the prefix plus the first question as a plain string.
    The second turn replays that text as a user message, appends the assistant
    item returned by the first turn, and adds the second question. Both turns
    use the same ``X-Session-Id``.

    Returns:
        0 when the second turn reports ``cached_tokens`` greater than zero,
        2 when no usable ``cached_tokens`` value is present in its usage
        object, and 1 otherwise.
    """
    cli = argparse.ArgumentParser(
        description="Drive two raw /v1/responses turns through ChatMock and check cached input tokens."
    )
    cli.add_argument("--base-url", default="http://127.0.0.1:8000", help="ChatMock base URL.")
    cli.add_argument("--api-key", default="key", help="Bearer token to send to ChatMock.")
    cli.add_argument("--model", default="gpt-5.4", help="Model to request.")
    cli.add_argument(
        "--session-id",
        default=f"cache-check-{uuid.uuid4()}",
        help="Fixed X-Session-Id for both turns.",
    )
    cli.add_argument(
        "--prefix",
        default=_default_prefix(),
        help="Large repeated first-turn prompt prefix.",
    )
    cli.add_argument(
        "--first-question",
        default="Reply with exactly: alpha",
        help="Trailing instruction for the first turn.",
    )
    cli.add_argument(
        "--second-question",
        default="Reply with exactly: beta",
        help="Trailing instruction for the second turn.",
    )
    opts = cli.parse_args()

    endpoint = opts.base_url.rstrip("/") + "/v1/responses"
    session = opts.session_id
    opening_text = f"{opts.prefix}\n\n{opts.first_question}"
    followup_text = opts.second_question

    for banner_line in (
        f"Using session id: {session}",
        f"POST target: {endpoint}",
        "This checks the raw Responses usage object returned through ChatMock.",
        "",
    ):
        print(banner_line)

    def build_payload(turn_input: Any) -> Dict[str, Any]:
        # Both turns share every field except the input itself.
        return {
            "model": opts.model,
            "store": False,
            "stream": False,
            "input": turn_input,
        }

    first_response = _post(endpoint, opts.api_key, session, build_payload(opening_text))
    # Extracted before the second request so a malformed first turn aborts early.
    assistant_item = _assistant_message_item(first_response)

    conversation = [
        _user_message(opening_text),
        assistant_item,
        _user_message(followup_text),
    ]
    second_response = _post(endpoint, opts.api_key, session, build_payload(conversation))

    first_usage = _usage_summary(first_response)
    second_usage = _usage_summary(second_response)
    second_cached = _cached_tokens(second_response)

    for title, usage in (("Turn 1", first_usage), ("Turn 2", second_usage)):
        print(title)
        print(json.dumps(usage, indent=2, ensure_ascii=False) if usage else " no usage object")
        print()

    if second_cached is None:
        first_input_tokens = first_usage.get("input_tokens")
        second_input_tokens = second_usage.get("input_tokens")
        print("Result: inconclusive")
        print("Reason: upstream did not include `usage.input_tokens_details.cached_tokens`.")
        if isinstance(first_input_tokens, int) and isinstance(second_input_tokens, int):
            print(f"Observed input_tokens delta: first={first_input_tokens}, second={second_input_tokens}")
        print("Codex treats cached-token reporting as the direct cache-hit signal; without it, this script cannot prove caching.")
        return 2

    if second_cached <= 0:
        print("Result: failure, follow-up turn reported cached_tokens=0.")
        return 1

    print(f"Result: success, follow-up turn reported cached_tokens={second_cached}.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, 130 on Ctrl-C, and 1
    # (after printing the message to stderr) on any other exception.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        sys.exit(130)
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(exit_code)
|
||||
143
scripts/test_responses_reuse.py
Normal file
143
scripts/test_responses_reuse.py
Normal file
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import uuid
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
from websockets.sync.client import connect
|
||||
|
||||
|
||||
def _user_message(text: str) -> Dict[str, Any]:
|
||||
return {
|
||||
"type": "message",
|
||||
"role": "user",
|
||||
"content": [{"type": "input_text", "text": text}],
|
||||
}
|
||||
|
||||
|
||||
def _receive_turn(ws) -> Tuple[str, Dict[str, Any]]:
|
||||
response_id: str | None = None
|
||||
assistant_item: Dict[str, Any] | None = None
|
||||
|
||||
while True:
|
||||
raw = ws.recv(timeout=120)
|
||||
event = json.loads(raw)
|
||||
event_type = event.get("type")
|
||||
if event_type == "error":
|
||||
raise RuntimeError(f"websocket error: {json.dumps(event, ensure_ascii=False)}")
|
||||
if event_type == "response.created":
|
||||
response = event.get("response")
|
||||
if isinstance(response, dict) and isinstance(response.get("id"), str):
|
||||
response_id = response["id"]
|
||||
elif event_type == "response.output_item.done":
|
||||
item = event.get("item")
|
||||
if (
|
||||
isinstance(item, dict)
|
||||
and item.get("type") == "message"
|
||||
and item.get("role") == "assistant"
|
||||
):
|
||||
assistant_item = item
|
||||
elif event_type == "response.completed":
|
||||
if not response_id:
|
||||
response = event.get("response")
|
||||
if isinstance(response, dict) and isinstance(response.get("id"), str):
|
||||
response_id = response["id"]
|
||||
if not response_id:
|
||||
raise RuntimeError("turn completed without a response id")
|
||||
if assistant_item is None:
|
||||
raise RuntimeError("turn completed without an assistant message item")
|
||||
return response_id, assistant_item
|
||||
|
||||
|
||||
def main() -> int:
    """Send two ``response.create`` turns over one websocket and print both response ids.

    The second turn resends the first prompt as a user message, the assistant
    item received in the first turn, and the second prompt. The connection is
    opened with an ``X-Session-Id`` header, and ``fast_mode`` is sent as true
    unless ``--no-fast-mode`` is given. The printed checklist describes what
    to look for in the server's verbose log.

    Returns:
        0 once both turns have completed.
    """
    cli = argparse.ArgumentParser(
        description="Exercise ChatMock websocket reuse the same way Codex does."
    )
    cli.add_argument(
        "--ws-url",
        default="ws://127.0.0.1:8000/v1/responses",
        help="ChatMock websocket URL.",
    )
    cli.add_argument("--model", default="gpt-5.4", help="Model to request.")
    cli.add_argument(
        "--session-id",
        default=f"reuse-demo-{uuid.uuid4()}",
        help="Fixed X-Session-Id for the whole run.",
    )
    cli.add_argument(
        "--first-prompt",
        default="Say exactly: alpha",
        help="Prompt for the first turn.",
    )
    cli.add_argument(
        "--second-prompt",
        default="Now say exactly: beta",
        help="Prompt appended in the reuse-candidate turn.",
    )
    cli.add_argument(
        "--no-fast-mode",
        action="store_true",
        help="Do not send fast_mode=true.",
    )
    opts = cli.parse_args()

    session_headers = {"X-Session-Id": opts.session_id}
    use_fast_mode = not opts.no_fast_mode

    for banner_line in (
        f"Using websocket session id: {opts.session_id}",
        f"Connecting to: {opts.ws_url}",
        "Run ChatMock with `python3 chatmock.py serve --verbose` in another terminal.",
        "This verifies the Codex-aligned path: websocket `response.create` reuse.",
        "HTTP `/v1/responses` is not expected to send `previous_response_id`.",
        "",
    ):
        print(banner_line)

    def create_request(turn_input: Any) -> str:
        # Both turns share every field except the input itself.
        return json.dumps(
            {
                "type": "response.create",
                "model": opts.model,
                "store": False,
                "input": turn_input,
                "fast_mode": use_fast_mode,
            }
        )

    with connect(opts.ws_url, additional_headers=session_headers, open_timeout=15) as ws:
        ws.send(create_request(opts.first_prompt))
        first_response_id, assistant_item = _receive_turn(ws)

        replayed_conversation = [
            _user_message(opts.first_prompt),
            assistant_item,
            _user_message(opts.second_prompt),
        ]
        ws.send(create_request(replayed_conversation))
        second_response_id, _ = _receive_turn(ws)

    for summary_line in (
        "Turn 1 completed.",
        f" response id: {first_response_id}",
        "Turn 2 completed.",
        f" response id: {second_response_id}",
        "",
        "Expected in the verbose ChatMock server log for turn 2:",
        " - outbound websocket payload includes `previous_response_id`",
        " - `previous_response_id` equals the first response id",
        " - outbound `input` only contains the new trailing user message",
        "",
        "If turn 2 still shows the full conversation in the outbound websocket payload, reuse is not working.",
    ):
        print(summary_line)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, 130 on Ctrl-C, and 1
    # (after printing the message to stderr) on any other exception.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        sys.exit(130)
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user