fix ollama regression

2025-08-19 21:26:47 +05:00
parent 554ec53a25
commit 2ed6dbe2d4
1 changed files with 114 additions and 9 deletions
--- a/chatmock/routes_ollama.py
+++ b/chatmock/routes_ollama.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
 import json
 import datetime
 import time
 from typing import Any, Dict, List
-from flask import Blueprint, Response, current_app, jsonify, make_response, request
+from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context
 from .config import BASE_INSTRUCTIONS
 from .http import build_cors_headers
@@ -160,7 +161,8 @@ def ollama_chat() -> Response:
            upstream.status_code,
        )
-    created_at = str(int(time.time() * 1000))
+    created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    model_out = normalize_model_name(model)
    if stream_req:
        def _gen():
@@ -169,6 +171,7 @@ def ollama_chat() -> Response:
            think_closed = False
            saw_any_summary = False
            pending_summary_paragraph = False
            full_parts: List[str] = []
            try:
                for raw_line in upstream.iter_lines(decode_unicode=False):
                    if not raw_line:
@@ -196,31 +199,134 @@ def ollama_chat() -> Response:
                        delta_txt = evt.get("delta") or ""
                        if compat == "o3":
                            if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
-                                yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n"
+                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": "\n"},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append("\n")
                                pending_summary_paragraph = False
                            if delta_txt:
                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": delta_txt},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append(delta_txt)
                        elif compat == "think-tags":
                            if not think_open and not think_closed:
-                                yield json.dumps({"message": {"role": "assistant", "content": "<think>"}}) + "\n"
+                                yield (
                                    json.dumps(
                                        {
                                            "model": model_out,
                                            "created_at": created_at,
                                            "message": {"role": "assistant", "content": "<think>"},
                                            "done": False,
                                        }
                                    )
                                    + "\n"
                                )
                                full_parts.append("<think>")
                                think_open = True
                            if think_open and not think_closed:
                                if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
-                                    yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n"
+                                    yield (
                                        json.dumps(
                                            {
                                                "model": model_out,
                                                "created_at": created_at,
                                                "message": {"role": "assistant", "content": "\n"},
                                                "done": False,
                                            }
                                        )
                                        + "\n"
                                    )
                                    full_parts.append("\n")
                                    pending_summary_paragraph = False
                                if delta_txt:
                                    yield (
                                        json.dumps(
                                            {
                                                "model": model_out,
                                                "created_at": created_at,
                                                "message": {"role": "assistant", "content": delta_txt},
                                                "done": False,
                                            }
                                        )
                                        + "\n"
                                    )
                                    full_parts.append(delta_txt)
                        else:
                            pass
                    elif kind == "response.output_text.delta":
                        delta = evt.get("delta") or ""
                        if compat == "think-tags" and think_open and not think_closed:
-                            yield json.dumps({"message": {"role": "assistant", "content": "</think>"}}) + "\n"
+                            yield (
                                json.dumps(
                                    {
                                        "model": model_out,
                                        "created_at": created_at,
                                        "message": {"role": "assistant", "content": "</think>"},
                                        "done": False,
                                    }
                                )
                                + "\n"
                            )
                            full_parts.append("</think>")
                            think_open = False
                            think_closed = True
-                        yield json.dumps({"message": {"role": "assistant", "content": delta}}) + "\n"
+                        if delta:
                            yield (
                                json.dumps(
                                    {
                                        "model": model_out,
                                        "created_at": created_at,
                                        "message": {"role": "assistant", "content": delta},
                                        "done": False,
                                    }
                                )
                                + "\n"
                            )
                            full_parts.append(delta)
                    elif kind == "response.completed":
                        break
            finally:
                upstream.close()
                if compat == "think-tags" and think_open and not think_closed:
                    yield (
                        json.dumps(
                            {
                                "model": model_out,
                                "created_at": created_at,
                                "message": {"role": "assistant", "content": "</think>"},
                                "done": False,
                            }
                        )
                        + "\n"
                    )
                    full_parts.append("</think>")
                done_obj = {
                    "model": model_out,
                    "created_at": created_at,
                    "message": {"role": "assistant", "content": "".join(full_parts)},
                    "done": True,
                }
                done_obj.update(_OLLAMA_FAKE_EVAL)
                yield json.dumps(done_obj) + "\n"
        resp = current_app.response_class(
-            _gen(),
+            stream_with_context(_gen()),
            status=200,
            mimetype="application/x-ndjson",
        )
@@ -296,4 +402,3 @@ def ollama_chat() -> Response:
    for k, v in build_cors_headers().items():
        resp.headers.setdefault(k, v)
    return resp