# 300 lines · 13 KiB · Python
from __future__ import annotations

import json
import time
from typing import Any, Dict, Iterator, List, Optional

from flask import Blueprint, Response, current_app, jsonify, make_response, request

from .config import BASE_INSTRUCTIONS
from .http import build_cors_headers
from .reasoning import build_reasoning_param
from .transform import convert_ollama_messages, normalize_ollama_tools
from .upstream import normalize_model_name, start_upstream_request
from .utils import convert_chat_messages_to_responses_input, convert_tools_chat_to_responses
|
|
|
|
|
|
# Blueprint that carries the /api/tags, /api/show and /api/chat routes
# registered further down in this module.
ollama_bp = Blueprint("ollama", __name__)
|
|
|
|
|
|
# Fixed placeholder statistics that are merged verbatim into /api/chat
# response bodies; the values are constants, not measurements.
_OLLAMA_FAKE_EVAL: Dict[str, int] = {
    "total_duration": 8_497_226_791,
    "load_duration": 1_747_193_958,
    "prompt_eval_count": 24,
    "prompt_eval_duration": 269_219_750,
    "eval_count": 247,
    "eval_duration": 6_413_802_458,
}
|
|
|
|
|
|
@ollama_bp.route("/api/tags", methods=["GET"])
def ollama_tags() -> Response:
    """Serve ``GET /api/tags`` with a static, single-entry model listing.

    The listing always contains one hard-coded model (``gpt-5``) whose
    metadata fields are fixed literals. CORS headers from
    ``build_cors_headers()`` are added without overriding headers that are
    already present on the response.
    """
    if current_app.config.get("VERBOSE"):
        print("IN GET /api/tags")

    name = "gpt-5"
    details = {
        "parent_model": "",
        "format": "gguf",
        "family": "llama",
        "families": ["llama"],
        "parameter_size": "8.0B",
        "quantization_level": "Q4_0",
    }
    entry = {
        "name": name,
        "model": name,
        "modified_at": "2023-10-01T00:00:00Z",
        "size": 815319791,
        "digest": "8648f39daa8fbf5b18c7b4e6a8fb4990c692751d49917417b8842ca5758e7ffc",
        "details": details,
    }

    response = make_response(jsonify({"models": [entry]}), 200)
    for header, value in build_cors_headers().items():
        # setdefault keeps any header that was already set on the response.
        response.headers.setdefault(header, value)
    return response
|
|
|
|
|
|
@ollama_bp.route("/api/show", methods=["POST"])
def ollama_show() -> Response:
    """Serve ``POST /api/show`` with static model metadata.

    The JSON body must be an object whose ``model`` field is a non-empty
    string; the value is validated but not otherwise used, and the same
    canned description is returned for every model name.

    Returns:
        A 200 JSON response with the canned description, or a 400 JSON
        response ``{"error": "Model not found"}`` when ``model`` is missing,
        blank, or the body is not a JSON object. Both carry the CORS headers
        from ``build_cors_headers()``.
    """
    if current_app.config.get("VERBOSE"):
        try:
            body_preview = (request.get_data(cache=True, as_text=True) or "")[:2000]
            print("IN POST /api/show\n" + body_preview)
        except Exception:
            # Request logging is best-effort; a failure to read or print the
            # body must never fail the request itself.
            pass

    try:
        payload = request.get_json(silent=True)
    except Exception:
        payload = None
    # A syntactically valid body such as `[1, 2]` or `"x"` is not an object;
    # treat it like an empty one so the request gets a 400 rather than an
    # AttributeError from `.get`.
    if not isinstance(payload, dict):
        payload = {}

    model = payload.get("model")
    if not isinstance(model, str) or not model.strip():
        resp = make_response(jsonify({"error": "Model not found"}), 400)
    else:
        v1_show_response = {
            "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /models/blobs/sha256:placeholder\nTEMPLATE \"\"\"{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: \"\"\"\nPARAMETER num_ctx 100000\nPARAMETER stop \"</s>\"\nPARAMETER stop \"USER:\"\nPARAMETER stop \"ASSISTANT:\"",
            "parameters": "num_keep 24\nstop \"<|start_header_id|>\"\nstop \"<|end_header_id|>\"\nstop \"<|eot_id|>\"",
            "template": "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
            "details": {
                "parent_model": "",
                "format": "gguf",
                "family": "llama",
                "families": ["llama"],
                "parameter_size": "8.0B",
                "quantization_level": "Q4_0",
            },
            "model_info": {
                "general.architecture": "llama",
                "general.file_type": 2,
                "llama.context_length": 2000000,
            },
            "capabilities": ["completion", "vision", "tools", "thinking"],
        }
        resp = make_response(jsonify(v1_show_response), 200)

    # Apply CORS headers to success and error responses alike so browser
    # clients can read the error body too.
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    return resp
|
|
|
|
|
|
def _chat_json_response(body: Dict[str, Any], status: int) -> Response:
    """Build a JSON response with ``status`` and the shared CORS headers."""
    resp = make_response(jsonify(body), status)
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    return resp


def _iter_upstream_events(upstream: Any) -> Iterator[Dict[str, Any]]:
    """Yield JSON-object events parsed from the upstream ``data: `` lines.

    Iteration stops at a ``[DONE]`` sentinel or a ``response.completed``
    event. Blank lines, non-``data: `` lines, unparsable payloads and payloads
    that are not JSON objects are skipped. The caller owns ``upstream`` and is
    responsible for closing it.
    """
    for raw_line in upstream.iter_lines(decode_unicode=False):
        if not raw_line:
            continue
        if isinstance(raw_line, (bytes, bytearray)):
            line = raw_line.decode("utf-8", errors="ignore")
        else:
            line = raw_line
        if not line.startswith("data: "):
            continue
        data = line[len("data: "):].strip()
        if not data:
            continue
        if data == "[DONE]":
            return
        try:
            evt = json.loads(data)
        except ValueError:
            continue
        if not isinstance(evt, dict):
            continue
        if evt.get("type") == "response.completed":
            return
        yield evt


def _tool_call_from_item(item: Any) -> Optional[Dict[str, Any]]:
    """Convert a completed ``function_call`` output item into a tool-call dict.

    Returns ``None`` when ``item`` is not a ``function_call`` dict or when its
    id, name or arguments are not strings.
    """
    if not isinstance(item, dict) or item.get("type") != "function_call":
        return None
    call_id = item.get("call_id") or item.get("id") or ""
    name = item.get("name") or ""
    args = item.get("arguments") or ""
    if not (isinstance(call_id, str) and isinstance(name, str) and isinstance(args, str)):
        return None
    return {
        "id": call_id,
        "type": "function",
        "function": {"name": name, "arguments": args},
    }


@ollama_bp.route("/api/chat", methods=["POST"])
def ollama_chat() -> Response:
    """Serve ``POST /api/chat`` by relaying the conversation to the upstream.

    The JSON body must be an object with a string ``model`` and a non-empty
    ``messages`` list (after conversion by ``convert_ollama_messages``).
    ``stream`` defaults to true. The first system message, if any, is moved to
    the front of the conversation and re-labelled as a user message.

    Returns:
        * 400 JSON error for an unparsable / non-object body or an invalid
          ``model`` / ``messages`` combination.
        * Whatever error response ``start_upstream_request`` produced.
        * A JSON error mirroring the upstream status when upstream answers
          with a status >= 400.
        * When streaming: an ``application/x-ndjson`` response with one JSON
          object per line, ending with a ``"done": true`` record.
        * Otherwise: a single JSON object holding the full assistant message.
    """
    verbose = bool(current_app.config.get("VERBOSE"))
    reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
    reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
    # Resolve the compat mode here, while the application context is active.
    # The streaming generator below runs after the view has returned, when
    # `current_app` is no longer usable.
    compat = str(current_app.config.get("REASONING_COMPAT", "think-tags") or "think-tags").strip().lower()

    try:
        raw = request.get_data(cache=True, as_text=True) or ""
        if verbose:
            print("IN POST /api/chat\n" + (raw[:2000] if isinstance(raw, str) else ""))
        payload = json.loads(raw) if raw else {}
    except Exception:
        # Boundary handler: any failure to read or parse the body is the
        # client's problem and maps to a 400.
        return _chat_json_response({"error": "Invalid JSON body"}, 400)
    if not isinstance(payload, dict):
        # Valid JSON that is not an object (list, string, number...).
        return _chat_json_response({"error": "Invalid JSON body"}, 400)

    model = payload.get("model")
    top_level_images = payload.get("images")
    messages = convert_ollama_messages(
        payload.get("messages"),
        top_level_images if isinstance(top_level_images, list) else None,
    )
    if isinstance(messages, list):
        sys_idx = next(
            (i for i, m in enumerate(messages) if isinstance(m, dict) and m.get("role") == "system"),
            None,
        )
        if sys_idx is not None:
            # Only the first system message is moved; it is resent as a user
            # message at the head of the conversation.
            sys_msg = messages.pop(sys_idx)
            messages.insert(0, {"role": "user", "content": sys_msg.get("content")})

    stream_flag = payload.get("stream")
    stream_req = True if stream_flag is None else bool(stream_flag)

    tools_field = payload.get("tools")
    tools_req = tools_field if isinstance(tools_field, list) else []
    tools_responses = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
    tool_choice = payload.get("tool_choice", "auto")
    parallel_tool_calls = bool(payload.get("parallel_tool_calls", False))

    if not isinstance(model, str) or not isinstance(messages, list) or not messages:
        return _chat_json_response({"error": "Invalid request format"}, 400)

    input_items = convert_chat_messages_to_responses_input(messages)
    model_name = normalize_model_name(model)

    upstream, error_resp = start_upstream_request(
        model_name,
        input_items,
        instructions=BASE_INSTRUCTIONS,
        tools=tools_responses,
        tool_choice=tool_choice,
        parallel_tool_calls=parallel_tool_calls,
        reasoning_param=build_reasoning_param(reasoning_effort, reasoning_summary, None),
    )
    if error_resp is not None:
        return error_resp

    if upstream.status_code >= 400:
        try:
            if upstream.content:
                err_body = json.loads(upstream.content.decode("utf-8", errors="ignore"))
            else:
                err_body = {"raw": upstream.text}
        except Exception:
            err_body = {"raw": upstream.text}
        if verbose:
            print("/api/chat upstream error status=", upstream.status_code, " body:", json.dumps(err_body)[:2000])
        # The error payload is not guaranteed to be {"error": {"message": ...}};
        # tolerate non-dict bodies and string-valued "error" fields.
        err_obj = err_body.get("error") if isinstance(err_body, dict) else None
        if isinstance(err_obj, dict):
            message = err_obj.get("message") or "Upstream error"
        elif isinstance(err_obj, str) and err_obj:
            message = err_obj
        else:
            message = "Upstream error"
        return _chat_json_response({"error": message}, upstream.status_code)

    # Millisecond epoch timestamp rendered as a string.
    created_at = str(int(time.time() * 1000))

    if stream_req:

        def _chunk(content: str, tool_calls: Optional[List[Dict[str, Any]]] = None) -> str:
            """Serialise one non-final NDJSON record."""
            message: Dict[str, Any] = {"role": "assistant", "content": content}
            if tool_calls:
                message["tool_calls"] = tool_calls
            record = {"model": model_name, "created_at": created_at, "message": message, "done": False}
            return json.dumps(record) + "\n"

        def _gen() -> Iterator[str]:
            think_open = False
            think_closed = False
            saw_any_summary = False
            pending_summary_paragraph = False
            try:
                for evt in _iter_upstream_events(upstream):
                    kind = evt.get("type")
                    if kind == "response.reasoning_summary_part.added":
                        if compat in ("think-tags", "o3"):
                            # Every summary part after the first is preceded
                            # by a newline once its text starts arriving.
                            if saw_any_summary:
                                pending_summary_paragraph = True
                            else:
                                saw_any_summary = True
                    elif kind in ("response.reasoning_summary_text.delta", "response.reasoning_text.delta"):
                        delta_txt = evt.get("delta")
                        if not isinstance(delta_txt, str):
                            delta_txt = ""
                        is_summary = kind == "response.reasoning_summary_text.delta"
                        if compat == "o3":
                            # NOTE(review): in "o3" mode only the paragraph
                            # separator is emitted, never the reasoning text
                            # itself; preserved as found - confirm intent.
                            if is_summary and pending_summary_paragraph:
                                yield _chunk("\n")
                                pending_summary_paragraph = False
                        elif compat == "think-tags":
                            if not think_open and not think_closed:
                                yield _chunk("<think>")
                                think_open = True
                            if think_open and not think_closed:
                                if is_summary and pending_summary_paragraph:
                                    yield _chunk("\n")
                                    pending_summary_paragraph = False
                                if delta_txt:
                                    # Stream the reasoning text inside the
                                    # <think> block, matching what the
                                    # non-streaming path embeds.
                                    yield _chunk(delta_txt)
                    elif kind == "response.output_text.delta":
                        delta = evt.get("delta")
                        if not isinstance(delta, str):
                            delta = ""
                        if think_open and not think_closed:
                            yield _chunk("</think>")
                            think_open = False
                            think_closed = True
                        yield _chunk(delta)
                    elif kind == "response.output_item.done":
                        tool_call = _tool_call_from_item(evt.get("item"))
                        if tool_call is not None:
                            if think_open and not think_closed:
                                yield _chunk("</think>")
                                think_open = False
                                think_closed = True
                            yield _chunk("", [tool_call])

                # Never leave a dangling <think> when the reply had no text.
                if think_open and not think_closed:
                    yield _chunk("</think>")
                final: Dict[str, Any] = {
                    "model": model_name,
                    "created_at": created_at,
                    "message": {"role": "assistant", "content": ""},
                    "done": True,
                    "done_reason": "stop",
                }
                final.update(_OLLAMA_FAKE_EVAL)
                yield json.dumps(final) + "\n"
            finally:
                upstream.close()

        resp = current_app.response_class(
            _gen(),
            status=200,
            mimetype="application/x-ndjson",
        )
        for k, v in build_cors_headers().items():
            resp.headers.setdefault(k, v)
        return resp

    text_parts: List[str] = []
    summary_parts: List[str] = []
    reasoning_parts: List[str] = []
    tool_calls: List[Dict[str, Any]] = []
    try:
        for evt in _iter_upstream_events(upstream):
            kind = evt.get("type")
            if kind == "response.output_item.done":
                tool_call = _tool_call_from_item(evt.get("item"))
                if tool_call is not None:
                    tool_calls.append(tool_call)
                continue
            delta = evt.get("delta")
            if not isinstance(delta, str):
                continue
            if kind == "response.output_text.delta":
                text_parts.append(delta)
            elif kind == "response.reasoning_summary_text.delta":
                summary_parts.append(delta)
            elif kind == "response.reasoning_text.delta":
                reasoning_parts.append(delta)
    finally:
        upstream.close()

    full_text = "".join(text_parts)
    if compat == "think-tags":
        # Summary first, then full reasoning, separated by a blank line;
        # whitespace-only sections are left out.
        sections = ["".join(summary_parts), "".join(reasoning_parts)]
        rtxt = "\n\n".join(s for s in sections if s.strip())
        if rtxt:
            full_text = f"<think>{rtxt}</think>" + full_text

    message: Dict[str, Any] = {"role": "assistant", "content": full_text}
    if tool_calls:
        message["tool_calls"] = tool_calls
    out_json: Dict[str, Any] = {
        "model": model_name,
        "created_at": created_at,
        "message": message,
        "done": True,
        "done_reason": "stop",
    }
    out_json.update(_OLLAMA_FAKE_EVAL)
    return _chat_json_response(out_json, 200)
|
|
|