from __future__ import annotations import json import datetime import time from typing import Any, Dict, List from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context from .config import BASE_INSTRUCTIONS from .http import build_cors_headers from .reasoning import build_reasoning_param, extract_reasoning_from_model_name from .transform import convert_ollama_messages, normalize_ollama_tools from .upstream import normalize_model_name, start_upstream_request from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses ollama_bp = Blueprint("ollama", __name__) _OLLAMA_FAKE_EVAL = { "total_duration": 8497226791, "load_duration": 1747193958, "prompt_eval_count": 24, "prompt_eval_duration": 269219750, "eval_count": 247, "eval_duration": 6413802458, } @ollama_bp.route("/api/tags", methods=["GET"]) def ollama_tags() -> Response: if bool(current_app.config.get("VERBOSE")): print("IN GET /api/tags") expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS")) model_ids = [ "gpt-5", *( [ "gpt-5-high", "gpt-5-medium", "gpt-5-low", "gpt-5-minimal", ] if expose_variants else [] ), ] models = [] for model_id in model_ids: models.append( { "name": model_id, "model": model_id, "modified_at": "2023-10-01T00:00:00Z", "size": 815319791, "digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc", "details": { "parent_model": "", "format": "gguf", "family": "llama", "families": ["llama"], "parameter_size": "8.0B", "quantization_level": "Q4_0", }, } ) resp = make_response(jsonify({"models": models}), 200) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) return resp @ollama_bp.route("/api/show", methods=["POST"]) def ollama_show() -> Response: verbose = bool(current_app.config.get("VERBOSE")) try: if verbose: body_preview = (request.get_data(cache=True, as_text=True) or "")[:2000] print("IN POST /api/show\n" + body_preview) except Exception: pass try: payload = request.get_json(silent=True) or {} except Exception: payload = {} model = payload.get("model") if not isinstance(model, str) or not model.strip(): return jsonify({"error": "Model not found"}), 400 v1_show_response = { "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /models/blobs/sha256:placeholder\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 100000\nPARAMETER stop \"\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"", "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"", "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", "details": { "parent_model": "", "format": "gguf", "family": "llama", "families": ["llama"], "parameter_size": "8.0B", "quantization_level": "Q4_0", }, "model_info": { "general.architecture": "llama", "general.file_type": 2, "llama.context_length": 2000000, }, "capabilities": ["completion", "vision", "tools", "thinking"], } resp = make_response(jsonify(v1_show_response), 200) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) return resp @ollama_bp.route("/api/chat", methods=["POST"]) def ollama_chat() -> Response: verbose = bool(current_app.config.get("VERBOSE")) reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium") reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto") reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags") try: raw = request.get_data(cache=True, as_text=True) or "" if verbose: print("IN POST /api/chat\n" + (raw[:2000] if isinstance(raw, str) else "")) payload = json.loads(raw) if raw else {} except Exception: return jsonify({"error": "Invalid JSON body"}), 400 model = payload.get("model") raw_messages = payload.get("messages") messages = convert_ollama_messages( raw_messages, payload.get("images") if isinstance(payload.get("images"), list) else None ) if isinstance(messages, list): sys_idx = next((i for i, m in enumerate(messages) if isinstance(m, dict) and m.get("role") == "system"), None) if isinstance(sys_idx, int): sys_msg = messages.pop(sys_idx) content = sys_msg.get("content") if isinstance(sys_msg, dict) else "" messages.insert(0, {"role": "user", "content": content}) stream_req = payload.get("stream") if stream_req is None: stream_req = True stream_req = bool(stream_req) tools_req = payload.get("tools") if isinstance(payload.get("tools"), list) else [] tools_responses = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req)) tool_choice = payload.get("tool_choice", "auto") parallel_tool_calls = bool(payload.get("parallel_tool_calls", False)) # Passthrough Responses API tools (web_search) via ChatMock extension fields extra_tools: List[Dict[str, Any]] = [] had_responses_tools = False rt_payload = payload.get("responses_tools") if isinstance(payload.get("responses_tools"), list) else [] if isinstance(rt_payload, list): for _t in rt_payload: if not (isinstance(_t, dict) and isinstance(_t.get("type"), str)): continue if _t.get("type") not in ("web_search", "web_search_preview"): return jsonify({"error": "Only web_search/web_search_preview are supported in responses_tools"}), 400 extra_tools.append(_t) if not extra_tools and bool(current_app.config.get("DEFAULT_WEB_SEARCH")): rtc = payload.get("responses_tool_choice") if not (isinstance(rtc, str) and rtc == "none"): extra_tools = [{"type": "web_search"}] if extra_tools: import json as _json MAX_TOOLS_BYTES = 32768 try: size = len(_json.dumps(extra_tools)) except Exception: size = 0 if size > MAX_TOOLS_BYTES: return jsonify({"error": "responses_tools too large"}), 400 had_responses_tools = True tools_responses = (tools_responses or []) + extra_tools rtc = payload.get("responses_tool_choice") if isinstance(rtc, str) and rtc in ("auto", "none"): tool_choice = rtc if not isinstance(model, str) or not isinstance(messages, list) or not messages: return jsonify({"error": "Invalid request format"}), 400 input_items = convert_chat_messages_to_responses_input(messages) model_reasoning = extract_reasoning_from_model_name(model) upstream, error_resp = start_upstream_request( normalize_model_name(model), input_items, instructions=BASE_INSTRUCTIONS, tools=tools_responses, tool_choice=tool_choice, parallel_tool_calls=parallel_tool_calls, reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning), ) if error_resp is not None: return error_resp if upstream.status_code >= 400: try: err_body = json.loads(upstream.content.decode("utf-8", errors="ignore")) if upstream.content else {"raw": upstream.text} except Exception: err_body = {"raw": upstream.text} if had_responses_tools: if verbose: print("[Passthrough] Upstream rejected tools; retrying without extras (args redacted)") base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req)) safe_choice = payload.get("tool_choice", "auto") upstream2, err2 = start_upstream_request( normalize_model_name(model), input_items, instructions=BASE_INSTRUCTIONS, tools=base_tools_only, tool_choice=safe_choice, parallel_tool_calls=parallel_tool_calls, reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, model_reasoning), ) if err2 is None and upstream2 is not None and upstream2.status_code < 400: upstream = upstream2 else: return ( jsonify({"error": {"message": (err_body.get("error", {}) or {}).get("message", "Upstream error"), "code": "RESPONSES_TOOLS_REJECTED"}}), (upstream2.status_code if upstream2 is not None else upstream.status_code), ) else: if verbose: print("/api/chat upstream error status=", upstream.status_code, " body:", json.dumps(err_body)[:2000]) return ( jsonify({"error": (err_body.get("error", {}) or {}).get("message", "Upstream error")}), upstream.status_code, ) created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") model_out = model if isinstance(model, str) and model.strip() else normalize_model_name(model) if stream_req: def _gen(): compat = (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower() think_open = False think_closed = False saw_any_summary = False pending_summary_paragraph = False full_parts: List[str] = [] try: for raw_line in upstream.iter_lines(decode_unicode=False): if not raw_line: continue line = raw_line.decode("utf-8", errors="ignore") if isinstance(raw_line, (bytes, bytearray)) else raw_line if not line.startswith("data: "): continue data = line[len("data: "):].strip() if not data: continue if data == "[DONE]": break try: evt = json.loads(data) except Exception: continue kind = evt.get("type") if kind == "response.reasoning_summary_part.added": if compat in ("think-tags", "o3"): if saw_any_summary: pending_summary_paragraph = True else: saw_any_summary = True elif kind in ("response.reasoning_summary_text.delta", "response.reasoning_text.delta"): delta_txt = evt.get("delta") or "" if compat == "o3": if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": "\n"}, "done": False, } ) + "\n" ) full_parts.append("\n") pending_summary_paragraph = False if delta_txt: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": delta_txt}, "done": False, } ) + "\n" ) full_parts.append(delta_txt) elif compat == "think-tags": if not think_open and not think_closed: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": ""}, "done": False, } ) + "\n" ) full_parts.append("") think_open = True if think_open and not think_closed: if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": "\n"}, "done": False, } ) + "\n" ) full_parts.append("\n") pending_summary_paragraph = False if delta_txt: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": delta_txt}, "done": False, } ) + "\n" ) full_parts.append(delta_txt) else: pass elif kind == "response.output_text.delta": delta = evt.get("delta") or "" if compat == "think-tags" and think_open and not think_closed: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": ""}, "done": False, } ) + "\n" ) full_parts.append("") think_open = False think_closed = True if delta: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": delta}, "done": False, } ) + "\n" ) full_parts.append(delta) elif kind == "response.completed": break finally: upstream.close() if compat == "think-tags" and think_open and not think_closed: yield ( json.dumps( { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": ""}, "done": False, } ) + "\n" ) full_parts.append("") done_obj = { "model": model_out, "created_at": created_at, "message": {"role": "assistant", "content": "".join(full_parts)}, "done": True, } done_obj.update(_OLLAMA_FAKE_EVAL) yield json.dumps(done_obj) + "\n" resp = current_app.response_class( stream_with_context(_gen()), status=200, mimetype="application/x-ndjson", ) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) return resp full_text = "" reasoning_summary_text = "" reasoning_full_text = "" tool_calls: List[Dict[str, Any]] = [] try: for raw in upstream.iter_lines(decode_unicode=False): if not raw: continue line = raw.decode("utf-8", errors="ignore") if isinstance(raw, (bytes, bytearray)) else raw if not line.startswith("data: "): continue data = line[len("data: "):].strip() if not data: continue if data == "[DONE]": break try: evt = json.loads(data) except Exception: continue kind = evt.get("type") if kind == "response.output_text.delta": full_text += evt.get("delta") or "" elif kind == "response.reasoning_summary_text.delta": reasoning_summary_text += evt.get("delta") or "" elif kind == "response.reasoning_text.delta": reasoning_full_text += evt.get("delta") or "" elif kind == "response.output_item.done": item = evt.get("item") or {} if isinstance(item, dict) and item.get("type") == "function_call": call_id = item.get("call_id") or item.get("id") or "" name = item.get("name") or "" args = item.get("arguments") or "" if isinstance(call_id, str) and isinstance(name, str) and isinstance(args, str): tool_calls.append( { "id": call_id, "type": "function", "function": {"name": name, "arguments": args}, } ) elif kind == "response.completed": break finally: upstream.close() if (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower() == "think-tags": rtxt_parts = [] if isinstance(reasoning_summary_text, str) and reasoning_summary_text.strip(): rtxt_parts.append(reasoning_summary_text) if isinstance(reasoning_full_text, str) and reasoning_full_text.strip(): rtxt_parts.append(reasoning_full_text) rtxt = "\n\n".join([p for p in rtxt_parts if p]) if rtxt: full_text = f"{rtxt}" + (full_text or "") out_json = { "model": normalize_model_name(model), "created_at": created_at, "message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})}, "done": True, "done_reason": "stop", } out_json.update(_OLLAMA_FAKE_EVAL) resp = make_response(jsonify(out_json), 200) for k, v in build_cors_headers().items(): resp.headers.setdefault(k, v) return resp