reasoning effort as models support
This commit is contained in:
@@ -14,6 +14,7 @@ def create_app(
|
|||||||
reasoning_summary: str = "auto",
|
reasoning_summary: str = "auto",
|
||||||
reasoning_compat: str = "think-tags",
|
reasoning_compat: str = "think-tags",
|
||||||
debug_model: str | None = None,
|
debug_model: str | None = None,
|
||||||
|
expose_reasoning_models: bool = False,
|
||||||
) -> Flask:
|
) -> Flask:
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@@ -24,6 +25,7 @@ def create_app(
|
|||||||
REASONING_COMPAT=reasoning_compat,
|
REASONING_COMPAT=reasoning_compat,
|
||||||
DEBUG_MODEL=debug_model,
|
DEBUG_MODEL=debug_model,
|
||||||
BASE_INSTRUCTIONS=BASE_INSTRUCTIONS,
|
BASE_INSTRUCTIONS=BASE_INSTRUCTIONS,
|
||||||
|
EXPOSE_REASONING_MODELS=bool(expose_reasoning_models),
|
||||||
)
|
)
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
@@ -41,4 +43,3 @@ def create_app(
|
|||||||
app.register_blueprint(ollama_bp)
|
app.register_blueprint(ollama_bp)
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ def cmd_serve(
|
|||||||
reasoning_summary: str,
|
reasoning_summary: str,
|
||||||
reasoning_compat: str,
|
reasoning_compat: str,
|
||||||
debug_model: str | None,
|
debug_model: str | None,
|
||||||
|
expose_reasoning_models: bool,
|
||||||
) -> int:
|
) -> int:
|
||||||
app = create_app(
|
app = create_app(
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
@@ -61,6 +62,7 @@ def cmd_serve(
|
|||||||
reasoning_summary=reasoning_summary,
|
reasoning_summary=reasoning_summary,
|
||||||
reasoning_compat=reasoning_compat,
|
reasoning_compat=reasoning_compat,
|
||||||
debug_model=debug_model,
|
debug_model=debug_model,
|
||||||
|
expose_reasoning_models=expose_reasoning_models,
|
||||||
)
|
)
|
||||||
|
|
||||||
app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
|
app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
|
||||||
@@ -106,6 +108,15 @@ def main() -> None:
|
|||||||
"'current' is accepted as an alias for 'legacy'"
|
"'current' is accepted as an alias for 'legacy'"
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
p_serve.add_argument(
|
||||||
|
"--expose-reasoning-models",
|
||||||
|
action="store_true",
|
||||||
|
default=os.getenv("CHATGPT_LOCAL_EXPOSE_REASONING_MODELS", "").strip().lower() in ("1", "true", "yes", "on"),
|
||||||
|
help=(
|
||||||
|
"Expose gpt-5 reasoning effort variants (minimal|low|medium|high) as separate models from /v1/models. "
|
||||||
|
"This allows choosing effort via model selection in compatible UIs."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
p_info = sub.add_parser("info", help="Print current stored tokens and derived account id")
|
p_info = sub.add_parser("info", help="Print current stored tokens and derived account id")
|
||||||
p_info.add_argument("--json", action="store_true", help="Output raw auth.json contents")
|
p_info.add_argument("--json", action="store_true", help="Output raw auth.json contents")
|
||||||
@@ -124,6 +135,7 @@ def main() -> None:
|
|||||||
reasoning_summary=args.reasoning_summary,
|
reasoning_summary=args.reasoning_summary,
|
||||||
reasoning_compat=args.reasoning_compat,
|
reasoning_compat=args.reasoning_compat,
|
||||||
debug_model=args.debug_model,
|
debug_model=args.debug_model,
|
||||||
|
expose_reasoning_models=args.expose_reasoning_models,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
elif args.command == "info":
|
elif args.command == "info":
|
||||||
|
|||||||
@@ -72,3 +72,29 @@ def apply_reasoning_to_message(
|
|||||||
message["content"] = think_block + (content_text or "")
|
message["content"] = think_block + (content_text or "")
|
||||||
return message
|
return message
|
||||||
|
|
||||||
|
|
||||||
|
def extract_reasoning_from_model_name(model: str | None) -> Dict[str, Any] | None:
|
||||||
|
"""Infer reasoning overrides from a model."""
|
||||||
|
if not isinstance(model, str) or not model:
|
||||||
|
return None
|
||||||
|
s = model.strip().lower()
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
efforts = {"minimal", "low", "medium", "high"}
|
||||||
|
|
||||||
|
if ":" in s:
|
||||||
|
maybe = s.rsplit(":", 1)[-1].strip()
|
||||||
|
if maybe in efforts:
|
||||||
|
return {"effort": maybe}
|
||||||
|
|
||||||
|
for sep in ("-", "_"):
|
||||||
|
if s.endswith(sep + "minimal"):
|
||||||
|
return {"effort": "minimal"}
|
||||||
|
if s.endswith(sep + "low"):
|
||||||
|
return {"effort": "low"}
|
||||||
|
if s.endswith(sep + "medium"):
|
||||||
|
return {"effort": "medium"}
|
||||||
|
if s.endswith(sep + "high"):
|
||||||
|
return {"effort": "high"}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from flask import Blueprint, Response, current_app, jsonify, make_response, requ
|
|||||||
|
|
||||||
from .config import BASE_INSTRUCTIONS
|
from .config import BASE_INSTRUCTIONS
|
||||||
from .http import build_cors_headers
|
from .http import build_cors_headers
|
||||||
from .reasoning import build_reasoning_param
|
from .reasoning import build_reasoning_param, extract_reasoning_from_model_name
|
||||||
from .transform import convert_ollama_messages, normalize_ollama_tools
|
from .transform import convert_ollama_messages, normalize_ollama_tools
|
||||||
from .upstream import normalize_model_name, start_upstream_request
|
from .upstream import normalize_model_name, start_upstream_request
|
||||||
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
|
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
|
||||||
@@ -32,24 +32,39 @@ _OLLAMA_FAKE_EVAL = {
|
|||||||
def ollama_tags() -> Response:
|
def ollama_tags() -> Response:
|
||||||
if bool(current_app.config.get("VERBOSE")):
|
if bool(current_app.config.get("VERBOSE")):
|
||||||
print("IN GET /api/tags")
|
print("IN GET /api/tags")
|
||||||
model_id = "gpt-5"
|
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
|
||||||
models = [
|
model_ids = [
|
||||||
{
|
"gpt-5",
|
||||||
"name": model_id,
|
*(
|
||||||
"model": model_id,
|
[
|
||||||
"modified_at": "2023-10-01T00:00:00Z",
|
"gpt-5-high",
|
||||||
"size": 815319791,
|
"gpt-5-medium",
|
||||||
"digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc",
|
"gpt-5-low",
|
||||||
"details": {
|
"gpt-5-minimal",
|
||||||
"parent_model": "",
|
]
|
||||||
"format": "gguf",
|
if expose_variants
|
||||||
"family": "llama",
|
else []
|
||||||
"families": ["llama"],
|
),
|
||||||
"parameter_size": "8.0B",
|
|
||||||
"quantization_level": "Q4_0",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
|
models = []
|
||||||
|
for model_id in model_ids:
|
||||||
|
models.append(
|
||||||
|
{
|
||||||
|
"name": model_id,
|
||||||
|
"model": model_id,
|
||||||
|
"modified_at": "2023-10-01T00:00:00Z",
|
||||||
|
"size": 815319791,
|
||||||
|
"digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc",
|
||||||
|
"details": {
|
||||||
|
"parent_model": "",
|
||||||
|
"format": "gguf",
|
||||||
|
"family": "llama",
|
||||||
|
"families": ["llama"],
|
||||||
|
"parameter_size": "8.0B",
|
||||||
|
"quantization_level": "Q4_0",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
resp = make_response(jsonify({"models": models}), 200)
|
resp = make_response(jsonify({"models": models}), 200)
|
||||||
for k, v in build_cors_headers().items():
|
for k, v in build_cors_headers().items():
|
||||||
resp.headers.setdefault(k, v)
|
resp.headers.setdefault(k, v)
|
||||||
@@ -137,6 +152,8 @@ def ollama_chat() -> Response:
|
|||||||
|
|
||||||
input_items = convert_chat_messages_to_responses_input(messages)
|
input_items = convert_chat_messages_to_responses_input(messages)
|
||||||
|
|
||||||
|
# Infer effort from model variant (gpt-5-high, etc.) but send base model upstream
|
||||||
|
model_reasoning = extract_reasoning_from_model_name(model)
|
||||||
upstream, error_resp = start_upstream_request(
|
upstream, error_resp = start_upstream_request(
|
||||||
normalize_model_name(model),
|
normalize_model_name(model),
|
||||||
input_items,
|
input_items,
|
||||||
@@ -144,7 +161,7 @@ def ollama_chat() -> Response:
|
|||||||
tools=tools_responses,
|
tools=tools_responses,
|
||||||
tool_choice=tool_choice,
|
tool_choice=tool_choice,
|
||||||
parallel_tool_calls=parallel_tool_calls,
|
parallel_tool_calls=parallel_tool_calls,
|
||||||
reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, None),
|
reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning),
|
||||||
)
|
)
|
||||||
if error_resp is not None:
|
if error_resp is not None:
|
||||||
return error_resp
|
return error_resp
|
||||||
@@ -162,7 +179,7 @@ def ollama_chat() -> Response:
|
|||||||
)
|
)
|
||||||
|
|
||||||
created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
|
created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
model_out = normalize_model_name(model)
|
model_out = model if isinstance(model, str) and model.strip() else normalize_model_name(model)
|
||||||
|
|
||||||
if stream_req:
|
if stream_req:
|
||||||
def _gen():
|
def _gen():
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from flask import Blueprint, Response, current_app, jsonify, make_response, requ
|
|||||||
|
|
||||||
from .config import BASE_INSTRUCTIONS
|
from .config import BASE_INSTRUCTIONS
|
||||||
from .http import build_cors_headers
|
from .http import build_cors_headers
|
||||||
from .reasoning import apply_reasoning_to_message, build_reasoning_param
|
from .reasoning import apply_reasoning_to_message, build_reasoning_param, extract_reasoning_from_model_name
|
||||||
from .upstream import normalize_model_name, start_upstream_request
|
from .upstream import normalize_model_name, start_upstream_request
|
||||||
from .utils import (
|
from .utils import (
|
||||||
convert_chat_messages_to_responses_input,
|
convert_chat_messages_to_responses_input,
|
||||||
@@ -45,7 +45,8 @@ def chat_completions() -> Response:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return jsonify({"error": {"message": "Invalid JSON body"}}), 400
|
return jsonify({"error": {"message": "Invalid JSON body"}}), 400
|
||||||
|
|
||||||
model = normalize_model_name(payload.get("model"), debug_model)
|
requested_model = payload.get("model")
|
||||||
|
model = normalize_model_name(requested_model, debug_model)
|
||||||
messages = payload.get("messages")
|
messages = payload.get("messages")
|
||||||
if messages is None and isinstance(payload.get("prompt"), str):
|
if messages is None and isinstance(payload.get("prompt"), str):
|
||||||
messages = [{"role": "user", "content": payload.get("prompt") or ""}]
|
messages = [{"role": "user", "content": payload.get("prompt") or ""}]
|
||||||
@@ -76,7 +77,8 @@ def chat_completions() -> Response:
|
|||||||
{"type": "message", "role": "user", "content": [{"type": "input_text", "text": payload.get("prompt")}]}
|
{"type": "message", "role": "user", "content": [{"type": "input_text", "text": payload.get("prompt")}]}
|
||||||
]
|
]
|
||||||
|
|
||||||
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else None
|
model_reasoning = extract_reasoning_from_model_name(requested_model)
|
||||||
|
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
|
||||||
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
|
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
|
||||||
|
|
||||||
upstream, error_resp = start_upstream_request(
|
upstream, error_resp = start_upstream_request(
|
||||||
@@ -109,7 +111,7 @@ def chat_completions() -> Response:
|
|||||||
resp = Response(
|
resp = Response(
|
||||||
sse_translate_chat(
|
sse_translate_chat(
|
||||||
upstream,
|
upstream,
|
||||||
model,
|
requested_model or model,
|
||||||
created,
|
created,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
vlog=print if verbose else None,
|
vlog=print if verbose else None,
|
||||||
@@ -206,7 +208,7 @@ def chat_completions() -> Response:
|
|||||||
"id": response_id or "chatcmpl",
|
"id": response_id or "chatcmpl",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"created": created,
|
"created": created,
|
||||||
"model": model,
|
"model": requested_model or model,
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
@@ -235,7 +237,8 @@ def completions() -> Response:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return jsonify({"error": {"message": "Invalid JSON body"}}), 400
|
return jsonify({"error": {"message": "Invalid JSON body"}}), 400
|
||||||
|
|
||||||
model = normalize_model_name(payload.get("model"), debug_model)
|
requested_model = payload.get("model")
|
||||||
|
model = normalize_model_name(requested_model, debug_model)
|
||||||
prompt = payload.get("prompt")
|
prompt = payload.get("prompt")
|
||||||
if isinstance(prompt, list):
|
if isinstance(prompt, list):
|
||||||
prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
|
prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
|
||||||
@@ -248,7 +251,8 @@ def completions() -> Response:
|
|||||||
messages = [{"role": "user", "content": prompt or ""}]
|
messages = [{"role": "user", "content": prompt or ""}]
|
||||||
input_items = convert_chat_messages_to_responses_input(messages)
|
input_items = convert_chat_messages_to_responses_input(messages)
|
||||||
|
|
||||||
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else None
|
model_reasoning = extract_reasoning_from_model_name(requested_model)
|
||||||
|
reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning
|
||||||
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
|
reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides)
|
||||||
upstream, error_resp = start_upstream_request(
|
upstream, error_resp = start_upstream_request(
|
||||||
model,
|
model,
|
||||||
@@ -274,7 +278,7 @@ def completions() -> Response:
|
|||||||
resp = Response(
|
resp = Response(
|
||||||
sse_translate_text(
|
sse_translate_text(
|
||||||
upstream,
|
upstream,
|
||||||
model,
|
requested_model or model,
|
||||||
created,
|
created,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
vlog=(print if verbose else None),
|
vlog=(print if verbose else None),
|
||||||
@@ -335,7 +339,7 @@ def completions() -> Response:
|
|||||||
"id": response_id or "cmpl",
|
"id": response_id or "cmpl",
|
||||||
"object": "text_completion",
|
"object": "text_completion",
|
||||||
"created": created,
|
"created": created,
|
||||||
"model": model,
|
"model": requested_model or model,
|
||||||
"choices": [
|
"choices": [
|
||||||
{"index": 0, "text": full_text, "finish_reason": "stop", "logprobs": None}
|
{"index": 0, "text": full_text, "finish_reason": "stop", "logprobs": None}
|
||||||
],
|
],
|
||||||
@@ -349,7 +353,20 @@ def completions() -> Response:
|
|||||||
|
|
||||||
@openai_bp.route("/v1/models", methods=["GET"])
|
@openai_bp.route("/v1/models", methods=["GET"])
|
||||||
def list_models() -> Response:
|
def list_models() -> Response:
|
||||||
models = {"object": "list", "data": [{"id": "gpt-5", "object": "model", "owned_by": "owner"}]}
|
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
|
||||||
|
data = []
|
||||||
|
if expose_variants:
|
||||||
|
variant_ids = [
|
||||||
|
"gpt-5",
|
||||||
|
"gpt-5-high",
|
||||||
|
"gpt-5-medium",
|
||||||
|
"gpt-5-low",
|
||||||
|
"gpt-5-minimal",
|
||||||
|
]
|
||||||
|
data = [{"id": mid, "object": "model", "owned_by": "owner"} for mid in variant_ids]
|
||||||
|
else:
|
||||||
|
data = [{"id": "gpt-5", "object": "model", "owned_by": "owner"}]
|
||||||
|
models = {"object": "list", "data": data}
|
||||||
resp = make_response(jsonify(models), 200)
|
resp = make_response(jsonify(models), 200)
|
||||||
for k, v in build_cors_headers().items():
|
for k, v in build_cors_headers().items():
|
||||||
resp.headers.setdefault(k, v)
|
resp.headers.setdefault(k, v)
|
||||||
|
|||||||
@@ -20,6 +20,13 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st
|
|||||||
if not isinstance(name, str) or not name.strip():
|
if not isinstance(name, str) or not name.strip():
|
||||||
return "gpt-5"
|
return "gpt-5"
|
||||||
base = name.split(":", 1)[0].strip()
|
base = name.split(":", 1)[0].strip()
|
||||||
|
for sep in ("-", "_"):
|
||||||
|
lowered = base.lower()
|
||||||
|
for effort in ("minimal", "low", "medium", "high"):
|
||||||
|
suffix = f"{sep}{effort}"
|
||||||
|
if lowered.endswith(suffix):
|
||||||
|
base = base[: -len(suffix)]
|
||||||
|
break
|
||||||
mapping = {
|
mapping = {
|
||||||
"gpt5": "gpt-5",
|
"gpt5": "gpt-5",
|
||||||
"gpt-5-latest": "gpt-5",
|
"gpt-5-latest": "gpt-5",
|
||||||
|
|||||||
Reference in New Issue
Block a user