From 7d944548cd8b9596f6700def5e055cb0aa176321 Mon Sep 17 00:00:00 2001 From: Game_Time <108236317+RayBytes@users.noreply.github.com> Date: Sun, 24 Aug 2025 23:53:37 +0500 Subject: [PATCH] reasoning effort as models support --- chatmock/app.py | 3 ++- chatmock/cli.py | 12 +++++++++ chatmock/reasoning.py | 26 ++++++++++++++++++ chatmock/routes_ollama.py | 57 +++++++++++++++++++++++++-------------- chatmock/routes_openai.py | 37 ++++++++++++++++++------- chatmock/upstream.py | 7 +++++ 6 files changed, 111 insertions(+), 31 deletions(-) diff --git a/chatmock/app.py b/chatmock/app.py index 5831fc8..5499e59 100644 --- a/chatmock/app.py +++ b/chatmock/app.py @@ -14,6 +14,7 @@ def create_app( reasoning_summary: str = "auto", reasoning_compat: str = "think-tags", debug_model: str | None = None, + expose_reasoning_models: bool = False, ) -> Flask: app = Flask(__name__) @@ -24,6 +25,7 @@ def create_app( REASONING_COMPAT=reasoning_compat, DEBUG_MODEL=debug_model, BASE_INSTRUCTIONS=BASE_INSTRUCTIONS, + EXPOSE_REASONING_MODELS=bool(expose_reasoning_models), ) @app.get("/") @@ -41,4 +43,3 @@ def create_app( app.register_blueprint(ollama_bp) return app - diff --git a/chatmock/cli.py b/chatmock/cli.py index a02e08a..80e12dd 100644 --- a/chatmock/cli.py +++ b/chatmock/cli.py @@ -54,6 +54,7 @@ def cmd_serve( reasoning_summary: str, reasoning_compat: str, debug_model: str | None, + expose_reasoning_models: bool, ) -> int: app = create_app( verbose=verbose, @@ -61,6 +62,7 @@ def cmd_serve( reasoning_summary=reasoning_summary, reasoning_compat=reasoning_compat, debug_model=debug_model, + expose_reasoning_models=expose_reasoning_models, ) app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True) @@ -106,6 +108,15 @@ def main() -> None: "'current' is accepted as an alias for 'legacy'" ), ) + p_serve.add_argument( + "--expose-reasoning-models", + action="store_true", + default=os.getenv("CHATGPT_LOCAL_EXPOSE_REASONING_MODELS", "").strip().lower() in ("1", "true", "yes", "on"), + help=( + "Expose gpt-5 reasoning effort variants (minimal|low|medium|high) as separate models from /v1/models. " + "This allows choosing effort via model selection in compatible UIs." + ), + ) p_info = sub.add_parser("info", help="Print current stored tokens and derived account id") p_info.add_argument("--json", action="store_true", help="Output raw auth.json contents") @@ -124,6 +135,7 @@ def main() -> None: reasoning_summary=args.reasoning_summary, reasoning_compat=args.reasoning_compat, debug_model=args.debug_model, + expose_reasoning_models=args.expose_reasoning_models, ) ) elif args.command == "info": diff --git a/chatmock/reasoning.py b/chatmock/reasoning.py index 7918ce2..7aeabf2 100644 --- a/chatmock/reasoning.py +++ b/chatmock/reasoning.py @@ -72,3 +72,29 @@ def apply_reasoning_to_message( message["content"] = think_block + (content_text or "") return message + +def extract_reasoning_from_model_name(model: str | None) -> Dict[str, Any] | None: + """Infer reasoning overrides from a model.""" + if not isinstance(model, str) or not model: + return None + s = model.strip().lower() + if not s: + return None + efforts = {"minimal", "low", "medium", "high"} + + if ":" in s: + maybe = s.rsplit(":", 1)[-1].strip() + if maybe in efforts: + return {"effort": maybe} + + for sep in ("-", "_"): + if s.endswith(sep + "minimal"): + return {"effort": "minimal"} + if s.endswith(sep + "low"): + return {"effort": "low"} + if s.endswith(sep + "medium"): + return {"effort": "medium"} + if s.endswith(sep + "high"): + return {"effort": "high"} + + return None diff --git a/chatmock/routes_ollama.py b/chatmock/routes_ollama.py index 00311b5..a6ae950 100644 --- a/chatmock/routes_ollama.py +++ b/chatmock/routes_ollama.py @@ -9,7 +9,7 @@ from flask import Blueprint, Response, current_app, jsonify, make_response, requ from .config import BASE_INSTRUCTIONS from .http import build_cors_headers -from .reasoning import build_reasoning_param +from .reasoning import build_reasoning_param, extract_reasoning_from_model_name from .transform import convert_ollama_messages, normalize_ollama_tools from .upstream import normalize_model_name, start_upstream_request from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses @@ -32,24 +32,39 @@ _OLLAMA_FAKE_EVAL = { def ollama_tags() -> Response: if bool(current_app.config.get("VERBOSE")): print("IN GET /api/tags") - model_id = "gpt-5" - models = [ - { - "name": model_id, - "model": model_id, - "modified_at": "2023-10-01T00:00:00Z", - "size": 815319791, - "digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc", - "details": { - "parent_model": "", - "format": "gguf", - "family": "llama", - "families": ["llama"], - "parameter_size": "8.0B", - "quantization_level": "Q4_0", - }, - } + expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS")) + model_ids = [ + "gpt-5", + *( + [ + "gpt-5-high", + "gpt-5-medium", + "gpt-5-low", + "gpt-5-minimal", + ] + if expose_variants + else [] + ), ] + models = [] + for model_id in model_ids: + models.append( + { + "name": model_id, + "model": model_id, + "modified_at": "2023-10-01T00:00:00Z", + "size": 815319791, + "digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc", + "details": { + "parent_model": "", + "format": "gguf", + "family": "llama", + "families": ["llama"], + "parameter_size": "8.0B", + "quantization_level": "Q4_0", + }, + } + ) resp = make_response(jsonify({"models": models}), 200) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) @@ -137,6 +152,8 @@ def ollama_chat() -> Response: input_items = convert_chat_messages_to_responses_input(messages) + # Infer effort from model variant (gpt-5-high, etc.) but send base model upstream + model_reasoning = extract_reasoning_from_model_name(model) upstream, error_resp = start_upstream_request( normalize_model_name(model), input_items, @@ -144,7 +161,7 @@ def ollama_chat() -> Response: tools=tools_responses, tool_choice=tool_choice, parallel_tool_calls=parallel_tool_calls, - reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, None), + reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning), ) if error_resp is not None: return error_resp @@ -162,7 +179,7 @@ def ollama_chat() -> Response: ) created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") - model_out = normalize_model_name(model) + model_out = model if isinstance(model, str) and model.strip() else normalize_model_name(model) if stream_req: def _gen(): diff --git a/chatmock/routes_openai.py b/chatmock/routes_openai.py index 446df69..fc0b711 100644 --- a/chatmock/routes_openai.py +++ b/chatmock/routes_openai.py @@ -8,7 +8,7 @@ from flask import Blueprint, Response, current_app, jsonify, make_response, requ from .config import BASE_INSTRUCTIONS from .http import build_cors_headers -from .reasoning import apply_reasoning_to_message, build_reasoning_param +from .reasoning import apply_reasoning_to_message, build_reasoning_param, extract_reasoning_from_model_name from .upstream import normalize_model_name, start_upstream_request from .utils import ( convert_chat_messages_to_responses_input, @@ -45,7 +45,8 @@ def chat_completions() -> Response: except Exception: return jsonify({"error": {"message": "Invalid JSON body"}}), 400 - model = normalize_model_name(payload.get("model"), debug_model) + requested_model = payload.get("model") + model = normalize_model_name(requested_model, debug_model) messages = payload.get("messages") if messages is None and isinstance(payload.get("prompt"), str): messages = [{"role": "user", "content": payload.get("prompt") or ""}] @@ -76,7 +77,8 @@ def chat_completions() -> Response: {"type": "message", "role": "user", "content": [{"type": "input_text", "text": payload.get("prompt")}]} ] - reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else None + model_reasoning = extract_reasoning_from_model_name(requested_model) + reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides) upstream, error_resp = start_upstream_request( @@ -109,7 +111,7 @@ def chat_completions() -> Response: resp = Response( sse_translate_chat( upstream, - model, + requested_model or model, created, verbose=verbose, vlog=print if verbose else None, @@ -206,7 +208,7 @@ def chat_completions() -> Response: "id": response_id or "chatcmpl", "object": "chat.completion", "created": created, - "model": model, + "model": requested_model or model, "choices": [ { "index": 0, @@ -235,7 +237,8 @@ def completions() -> Response: except Exception: return jsonify({"error": {"message": "Invalid JSON body"}}), 400 - model = normalize_model_name(payload.get("model"), debug_model) + requested_model = payload.get("model") + model = normalize_model_name(requested_model, debug_model) prompt = payload.get("prompt") if isinstance(prompt, list): prompt = "".join([p if isinstance(p, str) else "" for p in prompt]) @@ -248,7 +251,8 @@ def completions() -> Response: messages = [{"role": "user", "content": prompt or ""}] input_items = convert_chat_messages_to_responses_input(messages) - reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else None + model_reasoning = extract_reasoning_from_model_name(requested_model) + reasoning_overrides = payload.get("reasoning") if isinstance(payload.get("reasoning"), dict) else model_reasoning reasoning_param = build_reasoning_param(reasoning_effort, reasoning_summary, reasoning_overrides) upstream, error_resp = start_upstream_request( model, @@ -274,7 +278,7 @@ def completions() -> Response: resp = Response( sse_translate_text( upstream, - model, + requested_model or model, created, verbose=verbose, vlog=(print if verbose else None), @@ -335,7 +339,7 @@ def completions() -> Response: "id": response_id or "cmpl", "object": "text_completion", "created": created, - "model": model, + "model": requested_model or model, "choices": [ {"index": 0, "text": full_text, "finish_reason": "stop", "logprobs": None} ], @@ -349,7 +353,20 @@ def completions() -> Response: @openai_bp.route("/v1/models", methods=["GET"]) def list_models() -> Response: - models = {"object": "list", "data": [{"id": "gpt-5", "object": "model", "owned_by": "owner"}]} + expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS")) + data = [] + if expose_variants: + variant_ids = [ + "gpt-5", + "gpt-5-high", + "gpt-5-medium", + "gpt-5-low", + "gpt-5-minimal", + ] + data = [{"id": mid, "object": "model", "owned_by": "owner"} for mid in variant_ids] + else: + data = [{"id": "gpt-5", "object": "model", "owned_by": "owner"}] + models = {"object": "list", "data": data} resp = make_response(jsonify(models), 200) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) diff --git a/chatmock/upstream.py b/chatmock/upstream.py index ccbb882..75a74c3 100644 --- a/chatmock/upstream.py +++ b/chatmock/upstream.py @@ -20,6 +20,13 @@ def normalize_model_name(name: str | None, debug_model: str | None = None) -> st if not isinstance(name, str) or not name.strip(): return "gpt-5" base = name.split(":", 1)[0].strip() + for sep in ("-", "_"): + lowered = base.lower() + for effort in ("minimal", "low", "medium", "high"): + suffix = f"{sep}{effort}" + if lowered.endswith(suffix): + base = base[: -len(suffix)] + break mapping = { "gpt5": "gpt-5", "gpt-5-latest": "gpt-5",