ChatMock/chatmock/routes_ollama.py

from __future__ import annotations

import json
import datetime
import time
from typing import Any, Dict, List

from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context

from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .fast_mode import resolve_service_tier
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .model_registry import list_public_models, uses_codex_instructions
from .responses_api import instructions_for_model
from .reasoning import (
    allowed_efforts_for_model,
    build_reasoning_param,
    extract_reasoning_from_model_name,
)
from .transform import convert_ollama_messages, normalize_ollama_tools
from .upstream import normalize_model_name, start_upstream_request
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses


ollama_bp = Blueprint("ollama", __name__)


def _log_json(prefix: str, payload: Any) -> None:
    try:
        print(f"{prefix}\n{json.dumps(payload, indent=2, ensure_ascii=False)}")
    except Exception:
        try:
            print(f"{prefix}\n{payload}")
        except Exception:
            pass


def _wrap_stream_logging(label: str, iterator, enabled: bool):
    if not enabled:
        return iterator

    def _gen():
        for chunk in iterator:
            try:
                text = (
                    chunk.decode("utf-8", errors="replace")
                    if isinstance(chunk, (bytes, bytearray))
                    else str(chunk)
                )
                print(f"{label}\n{text}")
            except Exception:
                pass
            yield chunk

    return _gen()


@ollama_bp.route("/api/version", methods=["GET"])
def ollama_version() -> Response:
    if bool(current_app.config.get("VERBOSE")):
        print("IN GET /api/version")
    version = current_app.config.get("OLLAMA_VERSION", "0.12.10")
    if not isinstance(version, str) or not version.strip():
        version = "0.12.10"
    payload = {"version": version}
    resp = make_response(jsonify(payload), 200)
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    if bool(current_app.config.get("VERBOSE")):
        _log_json("OUT GET /api/version", payload)
    return resp


def _instructions_for_model(model: str) -> str:
    return instructions_for_model(current_app.config, model)


_OLLAMA_FAKE_EVAL = {
    "total_duration": 8497226791,
    "load_duration": 1747193958,
    "prompt_eval_count": 24,
    "prompt_eval_duration": 269219750,
    "eval_count": 247,
    "eval_duration": 6413802458,
}


@ollama_bp.route("/api/tags", methods=["GET"])
def ollama_tags() -> Response:
    if bool(current_app.config.get("VERBOSE")):
        print("IN GET /api/tags")
    expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
    model_ids = list_public_models(expose_reasoning_models=expose_variants)
    models = []
    for model_id in model_ids:
        models.append(
            {
                "name": model_id,
                "model": model_id,
                "modified_at": "2023-10-01T00:00:00Z",
                "size": 815319791,
                "digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc",
                "details": {
                    "parent_model": "",
                    "format": "gguf",
                    "family": "llama",
                    "families": ["llama"],
                    "parameter_size": "8.0B",
                    "quantization_level": "Q4_0",
                },
            }
        )
    payload = {"models": models}
    resp = make_response(jsonify(payload), 200)
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    if bool(current_app.config.get("VERBOSE")):
        _log_json("OUT GET /api/tags", payload)
    return resp


@ollama_bp.route("/api/show", methods=["POST"])
def ollama_show() -> Response:
    verbose = bool(current_app.config.get("VERBOSE"))
    raw_body = request.get_data(cache=True, as_text=True) or ""
    if verbose:
        try:
            print("IN POST /api/show\n" + raw_body)
        except Exception:
            pass
    try:
        payload = json.loads(raw_body) if raw_body else (request.get_json(silent=True) or {})
    except Exception:
        payload = request.get_json(silent=True) or {}
    model = payload.get("model")
    if not isinstance(model, str) or not model.strip():
        err = {"error": "Model not found"}
        if verbose:
            _log_json("OUT POST /api/show", err)
        return jsonify(err), 400
    v1_show_response = {
        "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /models/blobs/sha256:placeholder\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 100000\nPARAMETER stop \"</s>\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
        "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
        "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
        "details": {
            "parent_model": "",
            "format": "gguf",
            "family": "llama",
            "families": ["llama"],
            "parameter_size": "8.0B",
            "quantization_level": "Q4_0",
        },
        "model_info": {
            "general.architecture": "llama",
            "general.file_type": 2,
            "llama.context_length": 2000000,
        },
        "capabilities": ["completion", "vision", "tools", "thinking"],
    }
    if verbose:
        _log_json("OUT POST /api/show", v1_show_response)
    resp = make_response(jsonify(v1_show_response), 200)
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    return resp


@ollama_bp.route("/api/chat", methods=["POST"])
def ollama_chat() -> Response:
    verbose = bool(current_app.config.get("VERBOSE"))
    reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
    reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
    reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")

    try:
        raw = request.get_data(cache=True, as_text=True) or ""
        if verbose:
            print("IN POST /api/chat\n" + (raw if isinstance(raw, str) else ""))
        payload = json.loads(raw) if raw else {}
    except Exception:
        err = {"error": "Invalid JSON body"}
        if verbose:
            _log_json("OUT POST /api/chat", err)
        return jsonify(err), 400

    model = payload.get("model")
    raw_messages = payload.get("messages")
    messages = convert_ollama_messages(
        raw_messages, payload.get("images") if isinstance(payload.get("images"), list) else None
    )
    if isinstance(messages, list):
        sys_idx = next((i for i, m in enumerate(messages) if isinstance(m, dict) and m.get("role") == "system"), None)
        if isinstance(sys_idx, int):
            sys_msg = messages.pop(sys_idx)
            content = sys_msg.get("content") if isinstance(sys_msg, dict) else ""
            messages.insert(0, {"role": "user", "content": content})
    stream_req = payload.get("stream")
    if stream_req is None:
        stream_req = True
    stream_req = bool(stream_req)
    tools_req = payload.get("tools") if isinstance(payload.get("tools"), list) else []
    tools_responses = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
    tool_choice = payload.get("tool_choice", "auto")
    parallel_tool_calls = bool(payload.get("parallel_tool_calls", False))

    # Passthrough Responses API tools (web_search) via ChatMock extension fields
    extra_tools: List[Dict[str, Any]] = []
    had_responses_tools = False
    rt_payload = payload.get("responses_tools") if isinstance(payload.get("responses_tools"), list) else []
    if isinstance(rt_payload, list):
        for _t in rt_payload:
            if not (isinstance(_t, dict) and isinstance(_t.get("type"), str)):
                continue
            if _t.get("type") not in ("web_search", "web_search_preview"):
                err = {"error": "Only web_search/web_search_preview are supported in responses_tools"}
                if verbose:
                    _log_json("OUT POST /api/chat", err)
                return jsonify(err), 400
            extra_tools.append(_t)
        if not extra_tools and bool(current_app.config.get("DEFAULT_WEB_SEARCH")):
            rtc = payload.get("responses_tool_choice")
            if not (isinstance(rtc, str) and rtc == "none"):
                extra_tools = [{"type": "web_search"}]
        if extra_tools:
            import json as _json
            MAX_TOOLS_BYTES = 32768
            try:
                size = len(_json.dumps(extra_tools))
            except Exception:
                size = 0
            if size > MAX_TOOLS_BYTES:
                err = {"error": "responses_tools too large"}
                if verbose:
                    _log_json("OUT POST /api/chat", err)
                return jsonify(err), 400
            had_responses_tools = True
            tools_responses = (tools_responses or []) + extra_tools

    rtc = payload.get("responses_tool_choice")
    if isinstance(rtc, str) and rtc in ("auto", "none"):
        tool_choice = rtc

    if not isinstance(model, str) or not isinstance(messages, list) or not messages:
        err = {"error": "Invalid request format"}
        if verbose:
            _log_json("OUT POST /api/chat", err)
        return jsonify(err), 400

    input_items = convert_chat_messages_to_responses_input(messages)

    model_reasoning = extract_reasoning_from_model_name(model)
    normalized_model = normalize_model_name(model)
    service_tier_resolution = resolve_service_tier(
        normalized_model,
        request_fast_mode=payload.get("fast_mode"),
        request_service_tier=payload.get("service_tier"),
        server_fast_mode=bool(current_app.config.get("FAST_MODE")),
    )
    if service_tier_resolution.warning_message and verbose:
        print(f"[FastMode] {service_tier_resolution.warning_message}")
    if service_tier_resolution.error_message:
        err = {"error": service_tier_resolution.error_message}
        if verbose:
            _log_json("OUT POST /api/chat", err)
        return jsonify(err), 400
    upstream, error_resp = start_upstream_request(
        normalized_model,
        input_items,
        instructions=_instructions_for_model(normalized_model),
        tools=tools_responses,
        tool_choice=tool_choice,
        parallel_tool_calls=parallel_tool_calls,
        reasoning_param=build_reasoning_param(
            reasoning_effort,
            reasoning_summary,
            model_reasoning,
            allowed_efforts=allowed_efforts_for_model(model),
        ),
        service_tier=service_tier_resolution.service_tier,
    )
    if error_resp is not None:
        if verbose:
            try:
                body = error_resp.get_data(as_text=True)
                if body:
                    try:
                        parsed = json.loads(body)
                    except Exception:
                        parsed = body
                    _log_json("OUT POST /api/chat", parsed)
            except Exception:
                pass
        return error_resp

    record_rate_limits_from_response(upstream)

    if upstream.status_code >= 400:
        try:
            err_body = json.loads(upstream.content.decode("utf-8", errors="ignore")) if upstream.content else {"raw": upstream.text}
        except Exception:
            err_body = {"raw": upstream.text}
        if had_responses_tools:
            if verbose:
                print("[Passthrough] Upstream rejected tools; retrying without extras (args redacted)")
            base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
            safe_choice = payload.get("tool_choice", "auto")
            upstream2, err2 = start_upstream_request(
                normalize_model_name(model),
                input_items,
                instructions=BASE_INSTRUCTIONS,
                tools=base_tools_only,
                tool_choice=safe_choice,
                parallel_tool_calls=parallel_tool_calls,
                reasoning_param=build_reasoning_param(
                    reasoning_effort,
                    reasoning_summary,
                    model_reasoning,
                    allowed_efforts=allowed_efforts_for_model(model),
                ),
                service_tier=service_tier_resolution.service_tier,
            )
            record_rate_limits_from_response(upstream2)
            if err2 is None and upstream2 is not None and upstream2.status_code < 400:
                upstream = upstream2
            else:
                err = {"error": {"message": (err_body.get("error", {}) or {}).get("message", "Upstream error"), "code": "RESPONSES_TOOLS_REJECTED"}}
                if verbose:
                    _log_json("OUT POST /api/chat", err)
                return jsonify(err), (upstream2.status_code if upstream2 is not None else upstream.status_code)
        else:
            if verbose:
                print("/api/chat upstream error status=", upstream.status_code, " body:", json.dumps(err_body)[:2000])
            err = {"error": (err_body.get("error", {}) or {}).get("message", "Upstream error")}
            if verbose:
                _log_json("OUT POST /api/chat", err)
            return jsonify(err), upstream.status_code

    created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    model_out = model if isinstance(model, str) and model.strip() else normalized_model

    if stream_req:
        def _gen():
            compat = (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower()
            think_open = False
            think_closed = False
            saw_any_summary = False
            pending_summary_paragraph = False
            full_parts: List[str] = []
            try:
                for raw_line in upstream.iter_lines(decode_unicode=False):
                    if not raw_line:
                        continue
                    line = raw_line.decode("utf-8", errors="ignore") if isinstance(raw_line, (bytes, bytearray)) else raw_line
                    if not line.startswith("data: "):
                        continue
                    data = line[len("data: "):].strip()
                    if not data:
                        continue
                    if data == "[DONE]":
                        break
                    try:
                        evt = json.loads(data)
                    except Exception:
                        continue
                    kind = evt.get("type")
                    if kind == "response.reasoning_summary_part.added":
                        if compat in ("think-tags", "o3"):
                            if saw_any_summary:
                                pending_summary_paragraph = True
                            else:
                                saw_any_summary = True
                    elif kind in ("response.reasoning_summary_text.delta", "response.reasoning_text.delta"):
                        delta_txt = evt.get("delta") or ""
                        if compat == "o3":
                            if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": "\n"},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append("\n")
                                pending_summary_paragraph = False
                            if delta_txt:
                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": delta_txt},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append(delta_txt)
                        elif compat == "think-tags":
                            if not think_open and not think_closed:
                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": "<think>"},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append("<think>")
                                think_open = True
                            if think_open and not think_closed:
                                if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
                                    yield (
                                        json.dumps(
                                            {
                                                "model": model_out,
                                                "created_at": created_at,
                                                "message": {"role": "assistant", "content": "\n"},
                                                "done": False,
                                            }
                                        )
                                        + "\n"
                                    )
                                    full_parts.append("\n")
                                    pending_summary_paragraph = False
                                if delta_txt:
                                    yield (
                                        json.dumps(
                                            {
                                                "model": model_out,
                                                "created_at": created_at,
                                                "message": {"role": "assistant", "content": delta_txt},
                                                "done": False,
                                            }
                                        )
                                        + "\n"
                                    )
                                    full_parts.append(delta_txt)
                        else:
                            pass
                    elif kind == "response.output_text.delta":
                        delta = evt.get("delta") or ""
                        if compat == "think-tags" and think_open and not think_closed:
                            yield (
                                json.dumps(
                                    {
                                        "model": model_out,
                                        "created_at": created_at,
                                        "message": {"role": "assistant", "content": "</think>"},
                                        "done": False,
                                    }
                                )
                                + "\n"
                            )
                            full_parts.append("</think>")
                            think_open = False
                            think_closed = True
                        if delta:
                            yield (
                                json.dumps(
                                    {
                                        "model": model_out,
                                        "created_at": created_at,
                                        "message": {"role": "assistant", "content": delta},
                                        "done": False,
                                    }
                                )
                                + "\n"
                            )
                            full_parts.append(delta)
                    elif kind == "response.completed":
                        break
            finally:
                upstream.close()
                if compat == "think-tags" and think_open and not think_closed:
                    yield (
                        json.dumps(
                            {
                                "model": model_out,
                                "created_at": created_at,
                                "message": {"role": "assistant", "content": "</think>"},
                                "done": False,
                            }
                        )
                        + "\n"
                    )
                    full_parts.append("</think>")
                done_obj = {
                    "model": model_out,
                    "created_at": created_at,
                    "message": {"role": "assistant", "content": ""},
                    "done": True,
                }
                done_obj.update(_OLLAMA_FAKE_EVAL)
                yield json.dumps(done_obj) + "\n"
        if verbose:
            print("OUT POST /api/chat (streaming response)")
        stream_iter = stream_with_context(_gen())
        stream_iter = _wrap_stream_logging("STREAM OUT /api/chat", stream_iter, verbose)
        resp = current_app.response_class(
            stream_iter,
            status=200,
            mimetype="application/x-ndjson",
        )
        for k, v in build_cors_headers().items():
            resp.headers.setdefault(k, v)
        return resp

    full_text = ""
    reasoning_summary_text = ""
    reasoning_full_text = ""
    tool_calls: List[Dict[str, Any]] = []
    try:
        for raw in upstream.iter_lines(decode_unicode=False):
            if not raw:
                continue
            line = raw.decode("utf-8", errors="ignore") if isinstance(raw, (bytes, bytearray)) else raw
            if not line.startswith("data: "):
                continue
            data = line[len("data: "):].strip()
            if not data:
                continue
            if data == "[DONE]":
                break
            try:
                evt = json.loads(data)
            except Exception:
                continue
            kind = evt.get("type")
            if kind == "response.output_text.delta":
                full_text += evt.get("delta") or ""
            elif kind == "response.reasoning_summary_text.delta":
                reasoning_summary_text += evt.get("delta") or ""
            elif kind == "response.reasoning_text.delta":
                reasoning_full_text += evt.get("delta") or ""
            elif kind == "response.output_item.done":
                item = evt.get("item") or {}
                if isinstance(item, dict) and item.get("type") == "function_call":
                    call_id = item.get("call_id") or item.get("id") or ""
                    name = item.get("name") or ""
                    args = item.get("arguments") or ""
                    if isinstance(call_id, str) and isinstance(name, str) and isinstance(args, str):
                        tool_calls.append(
                            {
                                "id": call_id,
                                "type": "function",
                                "function": {"name": name, "arguments": args},
                            }
                        )
            elif kind == "response.completed":
                break
    finally:
        upstream.close()

    if (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower() == "think-tags":
        rtxt_parts = []
        if isinstance(reasoning_summary_text, str) and reasoning_summary_text.strip():
            rtxt_parts.append(reasoning_summary_text)
        if isinstance(reasoning_full_text, str) and reasoning_full_text.strip():
            rtxt_parts.append(reasoning_full_text)
        rtxt = "\n\n".join([p for p in rtxt_parts if p])
        if rtxt:
            full_text = f"<think>{rtxt}</think>" + (full_text or "")

    out_json = {
        "model": normalize_model_name(model),
        "created_at": created_at,
        "message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
        "done": True,
        "done_reason": "stop",
    }
    out_json.update(_OLLAMA_FAKE_EVAL)
    if verbose:
        _log_json("OUT POST /api/chat", out_json)
    resp = make_response(jsonify(out_json), 200)
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    return resp