Files
ChatMock/chatmock/routes_ollama.py
2025-12-12 01:39:56 +05:00

609 lines
26 KiB
Python

from __future__ import annotations
import json
import datetime
import time
from typing import Any, Dict, List
from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context
from .config import BASE_INSTRUCTIONS, GPT5_CODEX_INSTRUCTIONS
from .limits import record_rate_limits_from_response
from .http import build_cors_headers
from .reasoning import (
allowed_efforts_for_model,
build_reasoning_param,
extract_reasoning_from_model_name,
)
from .transform import convert_ollama_messages, normalize_ollama_tools
from .upstream import normalize_model_name, start_upstream_request
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
ollama_bp = Blueprint("ollama", __name__)
def _log_json(prefix: str, payload: Any) -> None:
try:
print(f"{prefix}\n{json.dumps(payload, indent=2, ensure_ascii=False)}")
except Exception:
try:
print(f"{prefix}\n{payload}")
except Exception:
pass
def _wrap_stream_logging(label: str, iterator, enabled: bool):
if not enabled:
return iterator
def _gen():
for chunk in iterator:
try:
text = (
chunk.decode("utf-8", errors="replace")
if isinstance(chunk, (bytes, bytearray))
else str(chunk)
)
print(f"{label}\n{text}")
except Exception:
pass
yield chunk
return _gen()
@ollama_bp.route("/api/version", methods=["GET"])
def ollama_version() -> Response:
if bool(current_app.config.get("VERBOSE")):
print("IN GET /api/version")
version = current_app.config.get("OLLAMA_VERSION", "0.12.10")
if not isinstance(version, str) or not version.strip():
version = "0.12.10"
payload = {"version": version}
resp = make_response(jsonify(payload), 200)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
if bool(current_app.config.get("VERBOSE")):
_log_json("OUT GET /api/version", payload)
return resp
def _instructions_for_model(model: str) -> str:
base = current_app.config.get("BASE_INSTRUCTIONS", BASE_INSTRUCTIONS)
if model.startswith("gpt-5-codex") or model.startswith("gpt-5.1-codex"):
codex = current_app.config.get("GPT5_CODEX_INSTRUCTIONS") or GPT5_CODEX_INSTRUCTIONS
if isinstance(codex, str) and codex.strip():
return codex
return base
_OLLAMA_FAKE_EVAL = {
"total_duration": 8497226791,
"load_duration": 1747193958,
"prompt_eval_count": 24,
"prompt_eval_duration": 269219750,
"eval_count": 247,
"eval_duration": 6413802458,
}
@ollama_bp.route("/api/tags", methods=["GET"])
def ollama_tags() -> Response:
if bool(current_app.config.get("VERBOSE")):
print("IN GET /api/tags")
expose_variants = bool(current_app.config.get("EXPOSE_REASONING_MODELS"))
model_ids = [
"gpt-5",
"gpt-5.1",
"gpt-5.2",
"gpt-5-codex",
"gpt-5.1-codex",
"gpt-5.1-codex-max",
"gpt-5.1-codex-mini",
"codex-mini",
]
if expose_variants:
model_ids.extend(
[
"gpt-5-high",
"gpt-5-medium",
"gpt-5-low",
"gpt-5-minimal",
"gpt-5.1-high",
"gpt-5.1-medium",
"gpt-5.1-low",
"gpt-5.2-xhigh",
"gpt-5.2-high",
"gpt-5.2-medium",
"gpt-5.2-low",
"gpt-5.2-none",
"gpt-5-codex-high",
"gpt-5-codex-medium",
"gpt-5-codex-low",
"gpt-5.1-codex-high",
"gpt-5.1-codex-medium",
"gpt-5.1-codex-low",
"gpt-5.1-codex-max-xhigh",
"gpt-5.1-codex-max-high",
"gpt-5.1-codex-max-medium",
"gpt-5.1-codex-max-low",
]
)
models = []
for model_id in model_ids:
models.append(
{
"name": model_id,
"model": model_id,
"modified_at": "2023-10-01T00:00:00Z",
"size": 815319791,
"digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": ["llama"],
"parameter_size": "8.0B",
"quantization_level": "Q4_0",
},
}
)
payload = {"models": models}
resp = make_response(jsonify(payload), 200)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
if bool(current_app.config.get("VERBOSE")):
_log_json("OUT GET /api/tags", payload)
return resp
@ollama_bp.route("/api/show", methods=["POST"])
def ollama_show() -> Response:
verbose = bool(current_app.config.get("VERBOSE"))
raw_body = request.get_data(cache=True, as_text=True) or ""
if verbose:
try:
print("IN POST /api/show\n" + raw_body)
except Exception:
pass
try:
payload = json.loads(raw_body) if raw_body else (request.get_json(silent=True) or {})
except Exception:
payload = request.get_json(silent=True) or {}
model = payload.get("model")
if not isinstance(model, str) or not model.strip():
err = {"error": "Model not found"}
if verbose:
_log_json("OUT POST /api/show", err)
return jsonify(err), 400
v1_show_response = {
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /models/blobs/sha256:placeholder\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 100000\nPARAMETER stop \"</s>\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
"parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
"template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
"details": {
"parent_model": "",
"format": "gguf",
"family": "llama",
"families": ["llama"],
"parameter_size": "8.0B",
"quantization_level": "Q4_0",
},
"model_info": {
"general.architecture": "llama",
"general.file_type": 2,
"llama.context_length": 2000000,
},
"capabilities": ["completion", "vision", "tools", "thinking"],
}
if verbose:
_log_json("OUT POST /api/show", v1_show_response)
resp = make_response(jsonify(v1_show_response), 200)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
@ollama_bp.route("/api/chat", methods=["POST"])
def ollama_chat() -> Response:
verbose = bool(current_app.config.get("VERBOSE"))
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
try:
raw = request.get_data(cache=True, as_text=True) or ""
if verbose:
print("IN POST /api/chat\n" + (raw if isinstance(raw, str) else ""))
payload = json.loads(raw) if raw else {}
except Exception:
err = {"error": "Invalid JSON body"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
model = payload.get("model")
raw_messages = payload.get("messages")
messages = convert_ollama_messages(
raw_messages, payload.get("images") if isinstance(payload.get("images"), list) else None
)
if isinstance(messages, list):
sys_idx = next((i for i, m in enumerate(messages) if isinstance(m, dict) and m.get("role") == "system"), None)
if isinstance(sys_idx, int):
sys_msg = messages.pop(sys_idx)
content = sys_msg.get("content") if isinstance(sys_msg, dict) else ""
messages.insert(0, {"role": "user", "content": content})
stream_req = payload.get("stream")
if stream_req is None:
stream_req = True
stream_req = bool(stream_req)
tools_req = payload.get("tools") if isinstance(payload.get("tools"), list) else []
tools_responses = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
tool_choice = payload.get("tool_choice", "auto")
parallel_tool_calls = bool(payload.get("parallel_tool_calls", False))
# Passthrough Responses API tools (web_search) via ChatMock extension fields
extra_tools: List[Dict[str, Any]] = []
had_responses_tools = False
rt_payload = payload.get("responses_tools") if isinstance(payload.get("responses_tools"), list) else []
if isinstance(rt_payload, list):
for _t in rt_payload:
if not (isinstance(_t, dict) and isinstance(_t.get("type"), str)):
continue
if _t.get("type") not in ("web_search", "web_search_preview"):
err = {"error": "Only web_search/web_search_preview are supported in responses_tools"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
extra_tools.append(_t)
if not extra_tools and bool(current_app.config.get("DEFAULT_WEB_SEARCH")):
rtc = payload.get("responses_tool_choice")
if not (isinstance(rtc, str) and rtc == "none"):
extra_tools = [{"type": "web_search"}]
if extra_tools:
import json as _json
MAX_TOOLS_BYTES = 32768
try:
size = len(_json.dumps(extra_tools))
except Exception:
size = 0
if size > MAX_TOOLS_BYTES:
err = {"error": "responses_tools too large"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
had_responses_tools = True
tools_responses = (tools_responses or []) + extra_tools
rtc = payload.get("responses_tool_choice")
if isinstance(rtc, str) and rtc in ("auto", "none"):
tool_choice = rtc
if not isinstance(model, str) or not isinstance(messages, list) or not messages:
err = {"error": "Invalid request format"}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), 400
input_items = convert_chat_messages_to_responses_input(messages)
model_reasoning = extract_reasoning_from_model_name(model)
normalized_model = normalize_model_name(model)
upstream, error_resp = start_upstream_request(
normalized_model,
input_items,
instructions=_instructions_for_model(normalized_model),
tools=tools_responses,
tool_choice=tool_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=build_reasoning_param(
reasoning_effort,
reasoning_summary,
model_reasoning,
allowed_efforts=allowed_efforts_for_model(model),
),
)
if error_resp is not None:
if verbose:
try:
body = error_resp.get_data(as_text=True)
if body:
try:
parsed = json.loads(body)
except Exception:
parsed = body
_log_json("OUT POST /api/chat", parsed)
except Exception:
pass
return error_resp
record_rate_limits_from_response(upstream)
if upstream.status_code >= 400:
try:
err_body = json.loads(upstream.content.decode("utf-8", errors="ignore")) if upstream.content else {"raw": upstream.text}
except Exception:
err_body = {"raw": upstream.text}
if had_responses_tools:
if verbose:
print("[Passthrough] Upstream rejected tools; retrying without extras (args redacted)")
base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
safe_choice = payload.get("tool_choice", "auto")
upstream2, err2 = start_upstream_request(
normalize_model_name(model),
input_items,
instructions=BASE_INSTRUCTIONS,
tools=base_tools_only,
tool_choice=safe_choice,
parallel_tool_calls=parallel_tool_calls,
reasoning_param=build_reasoning_param(
reasoning_effort,
reasoning_summary,
model_reasoning,
allowed_efforts=allowed_efforts_for_model(model),
),
)
record_rate_limits_from_response(upstream2)
if err2 is None and upstream2 is not None and upstream2.status_code < 400:
upstream = upstream2
else:
err = {"error": {"message": (err_body.get("error", {}) or {}).get("message", "Upstream error"), "code": "RESPONSES_TOOLS_REJECTED"}}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), (upstream2.status_code if upstream2 is not None else upstream.status_code)
else:
if verbose:
print("/api/chat upstream error status=", upstream.status_code, " body:", json.dumps(err_body)[:2000])
err = {"error": (err_body.get("error", {}) or {}).get("message", "Upstream error")}
if verbose:
_log_json("OUT POST /api/chat", err)
return jsonify(err), upstream.status_code
created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
model_out = model if isinstance(model, str) and model.strip() else normalized_model
if stream_req:
def _gen():
compat = (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower()
think_open = False
think_closed = False
saw_any_summary = False
pending_summary_paragraph = False
full_parts: List[str] = []
try:
for raw_line in upstream.iter_lines(decode_unicode=False):
if not raw_line:
continue
line = raw_line.decode("utf-8", errors="ignore") if isinstance(raw_line, (bytes, bytearray)) else raw_line
if not line.startswith("data: "):
continue
data = line[len("data: "):].strip()
if not data:
continue
if data == "[DONE]":
break
try:
evt = json.loads(data)
except Exception:
continue
kind = evt.get("type")
if kind == "response.reasoning_summary_part.added":
if compat in ("think-tags", "o3"):
if saw_any_summary:
pending_summary_paragraph = True
else:
saw_any_summary = True
elif kind in ("response.reasoning_summary_text.delta", "response.reasoning_text.delta"):
delta_txt = evt.get("delta") or ""
if compat == "o3":
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "\n"},
"done": False,
}
)
+ "\n"
)
full_parts.append("\n")
pending_summary_paragraph = False
if delta_txt:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta_txt},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta_txt)
elif compat == "think-tags":
if not think_open and not think_closed:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "<think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("<think>")
think_open = True
if think_open and not think_closed:
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "\n"},
"done": False,
}
)
+ "\n"
)
full_parts.append("\n")
pending_summary_paragraph = False
if delta_txt:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta_txt},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta_txt)
else:
pass
elif kind == "response.output_text.delta":
delta = evt.get("delta") or ""
if compat == "think-tags" and think_open and not think_closed:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "</think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("</think>")
think_open = False
think_closed = True
if delta:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta)
elif kind == "response.completed":
break
finally:
upstream.close()
if compat == "think-tags" and think_open and not think_closed:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "</think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("</think>")
done_obj = {
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": ""},
"done": True,
}
done_obj.update(_OLLAMA_FAKE_EVAL)
yield json.dumps(done_obj) + "\n"
if verbose:
print("OUT POST /api/chat (streaming response)")
stream_iter = stream_with_context(_gen())
stream_iter = _wrap_stream_logging("STREAM OUT /api/chat", stream_iter, verbose)
resp = current_app.response_class(
stream_iter,
status=200,
mimetype="application/x-ndjson",
)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp
full_text = ""
reasoning_summary_text = ""
reasoning_full_text = ""
tool_calls: List[Dict[str, Any]] = []
try:
for raw in upstream.iter_lines(decode_unicode=False):
if not raw:
continue
line = raw.decode("utf-8", errors="ignore") if isinstance(raw, (bytes, bytearray)) else raw
if not line.startswith("data: "):
continue
data = line[len("data: "):].strip()
if not data:
continue
if data == "[DONE]":
break
try:
evt = json.loads(data)
except Exception:
continue
kind = evt.get("type")
if kind == "response.output_text.delta":
full_text += evt.get("delta") or ""
elif kind == "response.reasoning_summary_text.delta":
reasoning_summary_text += evt.get("delta") or ""
elif kind == "response.reasoning_text.delta":
reasoning_full_text += evt.get("delta") or ""
elif kind == "response.output_item.done":
item = evt.get("item") or {}
if isinstance(item, dict) and item.get("type") == "function_call":
call_id = item.get("call_id") or item.get("id") or ""
name = item.get("name") or ""
args = item.get("arguments") or ""
if isinstance(call_id, str) and isinstance(name, str) and isinstance(args, str):
tool_calls.append(
{
"id": call_id,
"type": "function",
"function": {"name": name, "arguments": args},
}
)
elif kind == "response.completed":
break
finally:
upstream.close()
if (current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower() == "think-tags":
rtxt_parts = []
if isinstance(reasoning_summary_text, str) and reasoning_summary_text.strip():
rtxt_parts.append(reasoning_summary_text)
if isinstance(reasoning_full_text, str) and reasoning_full_text.strip():
rtxt_parts.append(reasoning_full_text)
rtxt = "\n\n".join([p for p in rtxt_parts if p])
if rtxt:
full_text = f"<think>{rtxt}</think>" + (full_text or "")
out_json = {
"model": normalize_model_name(model),
"created_at": created_at,
"message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
"done": True,
"done_reason": "stop",
}
out_json.update(_OLLAMA_FAKE_EVAL)
if verbose:
_log_json("OUT POST /api/chat", out_json)
resp = make_response(jsonify(out_json), 200)
for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v)
return resp