fix ollama regression

This commit is contained in:
Game_Time
2025-08-19 21:26:47 +05:00
parent 554ec53a25
commit 2ed6dbe2d4

View File

@@ -1,10 +1,11 @@
from __future__ import annotations from __future__ import annotations
import json import json
import datetime
import time import time
from typing import Any, Dict, List from typing import Any, Dict, List
from flask import Blueprint, Response, current_app, jsonify, make_response, request from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context
from .config import BASE_INSTRUCTIONS from .config import BASE_INSTRUCTIONS
from .http import build_cors_headers from .http import build_cors_headers
@@ -160,7 +161,8 @@ def ollama_chat() -> Response:
upstream.status_code, upstream.status_code,
) )
created_at = str(int(time.time() * 1000)) created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
model_out = normalize_model_name(model)
if stream_req: if stream_req:
def _gen(): def _gen():
@@ -169,6 +171,7 @@ def ollama_chat() -> Response:
think_closed = False think_closed = False
saw_any_summary = False saw_any_summary = False
pending_summary_paragraph = False pending_summary_paragraph = False
full_parts: List[str] = []
try: try:
for raw_line in upstream.iter_lines(decode_unicode=False): for raw_line in upstream.iter_lines(decode_unicode=False):
if not raw_line: if not raw_line:
@@ -196,31 +199,134 @@ def ollama_chat() -> Response:
delta_txt = evt.get("delta") or "" delta_txt = evt.get("delta") or ""
if compat == "o3": if compat == "o3":
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph: if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n" yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "\n"},
"done": False,
}
)
+ "\n"
)
full_parts.append("\n")
pending_summary_paragraph = False pending_summary_paragraph = False
if delta_txt:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta_txt},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta_txt)
elif compat == "think-tags": elif compat == "think-tags":
if not think_open and not think_closed: if not think_open and not think_closed:
yield json.dumps({"message": {"role": "assistant", "content": "<think>"}}) + "\n" yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "<think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("<think>")
think_open = True think_open = True
if think_open and not think_closed: if think_open and not think_closed:
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph: if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n" yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "\n"},
"done": False,
}
)
+ "\n"
)
full_parts.append("\n")
pending_summary_paragraph = False pending_summary_paragraph = False
if delta_txt:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta_txt},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta_txt)
else: else:
pass pass
elif kind == "response.output_text.delta": elif kind == "response.output_text.delta":
delta = evt.get("delta") or "" delta = evt.get("delta") or ""
if compat == "think-tags" and think_open and not think_closed: if compat == "think-tags" and think_open and not think_closed:
yield json.dumps({"message": {"role": "assistant", "content": "</think>"}}) + "\n" yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "</think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("</think>")
think_open = False think_open = False
think_closed = True think_closed = True
yield json.dumps({"message": {"role": "assistant", "content": delta}}) + "\n" if delta:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": delta},
"done": False,
}
)
+ "\n"
)
full_parts.append(delta)
elif kind == "response.completed": elif kind == "response.completed":
break break
finally: finally:
upstream.close() upstream.close()
if compat == "think-tags" and think_open and not think_closed:
yield (
json.dumps(
{
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "</think>"},
"done": False,
}
)
+ "\n"
)
full_parts.append("</think>")
done_obj = {
"model": model_out,
"created_at": created_at,
"message": {"role": "assistant", "content": "".join(full_parts)},
"done": True,
}
done_obj.update(_OLLAMA_FAKE_EVAL)
yield json.dumps(done_obj) + "\n"
resp = current_app.response_class( resp = current_app.response_class(
_gen(), stream_with_context(_gen()),
status=200, status=200,
mimetype="application/x-ndjson", mimetype="application/x-ndjson",
) )
@@ -296,4 +402,3 @@ def ollama_chat() -> Response:
for k, v in build_cors_headers().items(): for k, v in build_cors_headers().items():
resp.headers.setdefault(k, v) resp.headers.setdefault(k, v)
return resp return resp