feat: add responses api, websocket support, and fast mode

This commit is contained in:
Game_Time
2026-03-23 15:41:42 +05:00
parent e96db19538
commit 8754203ec6
22 changed files with 2148 additions and 119 deletions

View File

@@ -7,16 +7,31 @@ from typing import Any, Dict, List
from flask import Blueprint, Response, current_app, jsonify, make_response, request
from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .fast_mode import resolve_service_tier
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .model_registry import list_public_models, uses_codex_instructions
from .responses_api import (
ResponsesRequestError,
aggregate_response_from_sse,
extract_client_session_id,
instructions_for_model,
normalize_responses_payload,
stream_upstream_bytes,
)
from .reasoning import (
allowed_efforts_for_model,
apply_reasoning_to_message,
build_reasoning_param,
extract_reasoning_from_model_name,
)
from .upstream import normalize_model_name, start_upstream_request
from .session import (
clear_responses_reuse_state,
note_responses_final_response,
note_responses_stream_event,
prepare_responses_request_for_session,
)
from .upstream import normalize_model_name, start_upstream_raw_request, start_upstream_request
from .utils import (
convert_chat_messages_to_responses_input,
convert_tools_chat_to_responses,
@@ -59,12 +74,32 @@ def _wrap_stream_logging(label: str, iterator, enabled: bool):
def _instructions_for_model(model: str) -> str:
base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
if uses_codex_instructions(model):
codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
if isinstance(codex, str) and codex.strip():
return codex
return base
return instructions_for_model(current_app.config, model)
def _service_tier_from_payload(
model: str,
payload: Dict[str, Any],
*,
verbose: bool = False,
) -> tuple[str | None, Response | None]:
resolution = resolve_service_tier(
model,
request_fast_mode=payload.get("fast_mode"),
request_service_tier=payload.get("service_tier"),
server_fast_mode=bool(current_app.config.get("FAST_MODE")),
)
if resolution.warning_message and verbose:
print(f"[FastMode] {resolution.warning_message}")
if resolution.error_message:
err = {"error": {"message": resolution.error_message}}
if verbose:
_log_json("OUT POST service_tier resolution", err)
resp = make_response(jsonify(err), 400)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return None, resp
return resolution.service_tier, None
@openai_bp.route("/v1/chat/completions", methods=["POST"])
@@ -178,6 +213,9 @@ def chat_completions() -> Response:
reasoning_overrides,
allowed_efforts=allowed_efforts_for_model(model),
)
service_tier, tier_error = _service_tier_from_payload(model, payload, verbose=verbose)
if tier_error is not None:
return tier_error
upstream, error_resp = start_upstream_request(
model,
@@ -187,6 +225,7 @@ def chat_completions() -> Response:
tool_choice=tool_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=reasoning_param,
service_tier=service_tier,
)
if error_resp is not None:
if verbose:
@@ -224,6 +263,7 @@ def chat_completions() -> Response:
tool_choice=safe_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=reasoning_param,
service_tier=service_tier,
)
record_rate_limits_from_response(upstream2)
if err2 is None and upstream2 is not None and upstream2.status_code < 400:
@@ -413,11 +453,15 @@ def completions() -> Response:
reasoning_overrides,
allowed_efforts=allowed_efforts_for_model(model),
)
service_tier, tier_error = _service_tier_from_payload(model, payload, verbose=verbose)
if tier_error is not None:
return tier_error
upstream, error_resp = start_upstream_request(
model,
input_items,
instructions=_instructions_for_model(model),
reasoning_param=reasoning_param,
service_tier=service_tier,
)
if error_resp is not None:
if verbose:
@@ -529,6 +573,161 @@ def completions() -> Response:
return resp
@openai_bp.route("/v1/responses", methods=["POST"])
def responses_create() -> Response:
verbose = bool(current_app.config.get("VERBOSE"))
raw = request.get_data(cache=True, as_text=True) or ""
if verbose:
try:
print("IN POST /v1/responses\n" + raw)
except Exception:
pass
try:
payload = json.loads(raw) if raw else {}
except Exception:
err = {"error": {"message": "Invalid JSON body"}}
if verbose:
_log_json("OUT POST /v1/responses", err)
return jsonify(err), 400
if not isinstance(payload, dict):
err = {"error": {"message": "Request body must be a JSON object"}}
if verbose:
_log_json("OUT POST /v1/responses", err)
return jsonify(err), 400
try:
normalized = normalize_responses_payload(
payload,
config=current_app.config,
client_session_id=extract_client_session_id(request.headers),
)
except ResponsesRequestError as exc:
err: Dict[str, Any] = {"error": {"message": str(exc)}}
if exc.code:
err["error"]["code"] = exc.code
if verbose:
_log_json("OUT POST /v1/responses", err)
return jsonify(err), exc.status_code
if normalized.service_tier_resolution.warning_message and verbose:
print(f"[FastMode] {normalized.service_tier_resolution.warning_message}")
prepared = prepare_responses_request_for_session(
normalized.session_id,
normalized.payload,
allow_previous_response_id=False,
)
stream_req = bool(prepared.payload.get("stream", False))
upstream_payload = dict(prepared.payload)
upstream_payload["stream"] = True
upstream, error_resp = start_upstream_raw_request(
upstream_payload,
session_id=normalized.session_id,
stream=True,
)
if error_resp is not None:
clear_responses_reuse_state(normalized.session_id)
if verbose:
try:
body = error_resp.get_data(as_text=True)
if body:
try:
parsed = json.loads(body)
except Exception:
parsed = body
_log_json("OUT POST /v1/responses", parsed)
except Exception:
pass
return error_resp
record_rate_limits_from_response(upstream)
if upstream.status_code >= 400:
try:
err_body = json.loads(upstream.content.decode("utf-8", errors="ignore")) if upstream.content else {"error": {"message": upstream.text}}
except Exception:
err_body = {"error": {"message": upstream.text or "Upstream error"}}
finally:
upstream.close()
clear_responses_reuse_state(normalized.session_id)
if verbose:
_log_json("OUT POST /v1/responses", err_body)
resp = make_response(jsonify(err_body), upstream.status_code)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
if stream_req:
if verbose:
print("OUT POST /v1/responses (streaming response)")
stream_iter = _wrap_stream_logging(
"STREAM OUT /v1/responses",
stream_upstream_bytes(
upstream,
on_event=lambda evt: note_responses_stream_event(normalized.session_id, evt),
),
verbose,
)
resp = Response(
stream_iter,
status=upstream.status_code,
mimetype="text/event-stream",
headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
content_type = upstream.headers.get("Content-Type", "")
if "application/json" in content_type.lower():
try:
body = upstream.json()
except Exception:
body = None
finally:
upstream.close()
if isinstance(body, dict):
note_responses_final_response(normalized.session_id, body)
if verbose:
_log_json("OUT POST /v1/responses", body)
resp = make_response(jsonify(body), upstream.status_code)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
response_obj, error_obj = aggregate_response_from_sse(
upstream,
on_event=lambda evt: note_responses_stream_event(normalized.session_id, evt),
)
if error_obj is not None:
clear_responses_reuse_state(normalized.session_id)
if verbose:
_log_json("OUT POST /v1/responses", error_obj)
resp = make_response(jsonify(error_obj), 502)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
if response_obj is None:
clear_responses_reuse_state(normalized.session_id)
err = {"error": {"message": "Upstream response stream did not contain a completed response object"}}
if verbose:
_log_json("OUT POST /v1/responses", err)
resp = make_response(jsonify(err), 502)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
if verbose:
_log_json("OUT POST /v1/responses", response_obj)
resp = make_response(jsonify(response_obj), upstream.status_code)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
@openai_bp.route("/v1/models", methods=["GET"])
def list_models() -> Response:
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))