Fixes #103: Responses API max_output_tokens bug
This commit is contained in:
@@ -284,7 +284,7 @@ def cmd_serve(
|
||||
default_web_search=default_web_search,
|
||||
)
|
||||
|
||||
app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
|
||||
app.run(host=host, use_reloader=False, port=port, threaded=True)
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
@@ -88,6 +88,7 @@ def normalize_responses_payload(
|
||||
|
||||
normalized = dict(payload)
|
||||
normalized["model"] = normalized_model
|
||||
normalized.pop("max_output_tokens", None)
|
||||
|
||||
if "input" in normalized:
|
||||
normalized["input"] = canonicalize_responses_input(normalized.get("input"))
|
||||
|
||||
@@ -250,7 +250,7 @@ def ollama_chat() -> Response:
|
||||
input_items = convert_chat_messages_to_responses_input(messages)
|
||||
|
||||
model_reasoning = extract_reasoning_from_model_name(model)
|
||||
normalized_model = normalize_model_name(model)
|
||||
normalized_model = normalize_model_name(model, current_app.config.get("DEBUG_MODEL"))
|
||||
service_tier_resolution = resolve_service_tier(
|
||||
normalized_model,
|
||||
request_fast_mode=payload.get("fast_mode"),
|
||||
@@ -306,7 +306,7 @@ def ollama_chat() -> Response:
|
||||
base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
|
||||
safe_choice = payload.get("tool_choice", "auto")
|
||||
upstream2, err2 = start_upstream_request(
|
||||
normalize_model_name(model),
|
||||
normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
|
||||
input_items,
|
||||
instructions=BASE_INSTRUCTIONS,
|
||||
tools=base_tools_only,
|
||||
@@ -570,7 +570,7 @@ def ollama_chat() -> Response:
|
||||
full_text = f"<think>{rtxt}</think>" + (full_text or "")
|
||||
|
||||
out_json = {
|
||||
"model": normalize_model_name(model),
|
||||
"model": normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
|
||||
"created_at": created_at,
|
||||
"message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
|
||||
"done": True,
|
||||
|
||||
@@ -109,7 +109,6 @@ def chat_completions() -> Response:
|
||||
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
||||
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
||||
reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
|
||||
debug_model = current_app.config.get("DEBUG_MODEL")
|
||||
|
||||
raw = request.get_data(cache=True, as_text=True) or ""
|
||||
if verbose:
|
||||
@@ -129,7 +128,7 @@ def chat_completions() -> Response:
|
||||
return jsonify(err), 400
|
||||
|
||||
requested_model = payload.get("model")
|
||||
model = normalize_model_name(requested_model, debug_model)
|
||||
model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
|
||||
messages = payload.get("messages")
|
||||
if messages is None and isinstance(payload.get("prompt"), str):
|
||||
messages = [{"role": "user", "content": payload.get("prompt") or ""}]
|
||||
@@ -413,7 +412,6 @@ def chat_completions() -> Response:
|
||||
def completions() -> Response:
|
||||
verbose = bool(current_app.config.get("VERBOSE"))
|
||||
verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION"))
|
||||
debug_model = current_app.config.get("DEBUG_MODEL")
|
||||
reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
|
||||
reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
|
||||
|
||||
@@ -432,7 +430,7 @@ def completions() -> Response:
|
||||
return jsonify(err), 400
|
||||
|
||||
requested_model = payload.get("model")
|
||||
model = normalize_model_name(requested_model, debug_model)
|
||||
model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
|
||||
prompt = payload.get("prompt")
|
||||
if isinstance(prompt, list):
|
||||
prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
|
||||
|
||||
Reference in New Issue
Block a user