fix ollama regression
This commit is contained in:
@@ -1,10 +1,11 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import datetime
|
||||||
import time
|
import time
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
from flask import Blueprint, Response, current_app, jsonify, make_response, request
|
from flask import Blueprint, Response, current_app, jsonify, make_response, request, stream_with_context
|
||||||
|
|
||||||
from .config import BASE_INSTRUCTIONS
|
from .config import BASE_INSTRUCTIONS
|
||||||
from .http import build_cors_headers
|
from .http import build_cors_headers
|
||||||
@@ -160,7 +161,8 @@ def ollama_chat() -> Response:
|
|||||||
upstream.status_code,
|
upstream.status_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
created_at = str(int(time.time() * 1000))
|
created_at = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
model_out = normalize_model_name(model)
|
||||||
|
|
||||||
if stream_req:
|
if stream_req:
|
||||||
def _gen():
|
def _gen():
|
||||||
@@ -169,6 +171,7 @@ def ollama_chat() -> Response:
|
|||||||
think_closed = False
|
think_closed = False
|
||||||
saw_any_summary = False
|
saw_any_summary = False
|
||||||
pending_summary_paragraph = False
|
pending_summary_paragraph = False
|
||||||
|
full_parts: List[str] = []
|
||||||
try:
|
try:
|
||||||
for raw_line in upstream.iter_lines(decode_unicode=False):
|
for raw_line in upstream.iter_lines(decode_unicode=False):
|
||||||
if not raw_line:
|
if not raw_line:
|
||||||
@@ -196,31 +199,134 @@ def ollama_chat() -> Response:
|
|||||||
delta_txt = evt.get("delta") or ""
|
delta_txt = evt.get("delta") or ""
|
||||||
if compat == "o3":
|
if compat == "o3":
|
||||||
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
|
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
|
||||||
yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n"
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "\n"},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append("\n")
|
||||||
pending_summary_paragraph = False
|
pending_summary_paragraph = False
|
||||||
|
if delta_txt:
|
||||||
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": delta_txt},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append(delta_txt)
|
||||||
elif compat == "think-tags":
|
elif compat == "think-tags":
|
||||||
if not think_open and not think_closed:
|
if not think_open and not think_closed:
|
||||||
yield json.dumps({"message": {"role": "assistant", "content": "<think>"}}) + "\n"
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "<think>"},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append("<think>")
|
||||||
think_open = True
|
think_open = True
|
||||||
if think_open and not think_closed:
|
if think_open and not think_closed:
|
||||||
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
|
if kind == "response.reasoning_summary_text.delta" and pending_summary_paragraph:
|
||||||
yield json.dumps({"message": {"role": "assistant", "content": "\n"}}) + "\n"
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "\n"},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append("\n")
|
||||||
pending_summary_paragraph = False
|
pending_summary_paragraph = False
|
||||||
|
if delta_txt:
|
||||||
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": delta_txt},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append(delta_txt)
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
elif kind == "response.output_text.delta":
|
elif kind == "response.output_text.delta":
|
||||||
delta = evt.get("delta") or ""
|
delta = evt.get("delta") or ""
|
||||||
if compat == "think-tags" and think_open and not think_closed:
|
if compat == "think-tags" and think_open and not think_closed:
|
||||||
yield json.dumps({"message": {"role": "assistant", "content": "</think>"}}) + "\n"
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "</think>"},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append("</think>")
|
||||||
think_open = False
|
think_open = False
|
||||||
think_closed = True
|
think_closed = True
|
||||||
yield json.dumps({"message": {"role": "assistant", "content": delta}}) + "\n"
|
if delta:
|
||||||
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": delta},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append(delta)
|
||||||
elif kind == "response.completed":
|
elif kind == "response.completed":
|
||||||
break
|
break
|
||||||
finally:
|
finally:
|
||||||
upstream.close()
|
upstream.close()
|
||||||
|
if compat == "think-tags" and think_open and not think_closed:
|
||||||
|
yield (
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "</think>"},
|
||||||
|
"done": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
+ "\n"
|
||||||
|
)
|
||||||
|
full_parts.append("</think>")
|
||||||
|
done_obj = {
|
||||||
|
"model": model_out,
|
||||||
|
"created_at": created_at,
|
||||||
|
"message": {"role": "assistant", "content": "".join(full_parts)},
|
||||||
|
"done": True,
|
||||||
|
}
|
||||||
|
done_obj.update(_OLLAMA_FAKE_EVAL)
|
||||||
|
yield json.dumps(done_obj) + "\n"
|
||||||
resp = current_app.response_class(
|
resp = current_app.response_class(
|
||||||
_gen(),
|
stream_with_context(_gen()),
|
||||||
status=200,
|
status=200,
|
||||||
mimetype="application/x-ndjson",
|
mimetype="application/x-ndjson",
|
||||||
)
|
)
|
||||||
@@ -296,4 +402,3 @@ def ollama_chat() -> Response:
|
|||||||
for k, v in build_cors_headers().items():
|
for k, v in build_cors_headers().items():
|
||||||
resp.headers.setdefault(k, v)
|
resp.headers.setdefault(k, v)
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user