Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
77060d46
Unverified
Commit
77060d46
authored
Oct 08, 2025
by
Parth Sareen
Committed by
GitHub
Oct 08, 2025
Browse files
routes: structured outputs for gpt-oss (#12460)
parent
1b91d4dd
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
374 additions
and
61 deletions
+374
-61
server/routes.go
server/routes.go
+140
-61
server/routes_generate_test.go
server/routes_generate_test.go
+234
-0
No files found.
server/routes.go
View file @
77060d46
...
...
@@ -1979,14 +1979,42 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolParser
=
tools
.
NewParser
(
m
.
Template
.
Template
,
req
.
Tools
)
}
type
structuredOutputsState
int
const
(
structuredOutputsState_None
structuredOutputsState
=
iota
structuredOutputsState_ReadyToApply
structuredOutputsState_Applying
)
ch
:=
make
(
chan
any
)
go
func
()
{
defer
close
(
ch
)
if
err
:=
r
.
Completion
(
c
.
Request
.
Context
(),
llm
.
CompletionRequest
{
structuredOutputsState
:=
structuredOutputsState_None
for
{
var
tb
strings
.
Builder
currentFormat
:=
req
.
Format
// structured outputs via double request is enabled when:
// 1. the model supports the thinking capability and
// 2. it uses a built-in parser or our generic thinking parser
// Note that the current approach does not work for (potential future)
// non-thinking models that emit anything before actual content. This
// current approach uses the transition from parsed thinking content to
// parsed non-thinking content as the signal to turn constraining on
if
req
.
Format
!=
nil
&&
structuredOutputsState
==
structuredOutputsState_None
&&
((
builtinParser
!=
nil
||
thinkingState
!=
nil
)
&&
slices
.
Contains
(
m
.
Capabilities
(),
model
.
CapabilityThinking
))
{
currentFormat
=
nil
}
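// derive a cancellable context from the request context for this completion attempt, so the attempt can be cancelled on its own when switching to structured outputs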
ctx
,
cancel
:=
context
.
WithCancel
(
c
.
Request
.
Context
())
err
:=
r
.
Completion
(
ctx
,
llm
.
CompletionRequest
{
Prompt
:
prompt
,
Images
:
images
,
Format
:
req
.
Format
,
Format
:
current
Format
,
Options
:
opts
,
},
func
(
r
llm
.
CompletionResponse
)
{
res
:=
api
.
ChatResponse
{
...
...
@@ -2020,13 +2048,20 @@ func (s *Server) ChatHandler(c *gin.Context) {
res
.
Message
.
Thinking
=
thinking
res
.
Message
.
ToolCalls
=
toolCalls
tb
.
WriteString
(
thinking
)
// we are now receiving content from the model - we should start applying structured outputs
if
structuredOutputsState
==
structuredOutputsState_None
&&
req
.
Format
!=
nil
&&
tb
.
String
()
!=
""
&&
res
.
Message
.
Content
!=
""
{
structuredOutputsState
=
structuredOutputsState_ReadyToApply
cancel
()
return
}
if
res
.
Message
.
Content
!=
""
||
res
.
Message
.
Thinking
!=
""
||
len
(
res
.
Message
.
ToolCalls
)
>
0
||
r
.
Done
{
slog
.
Log
(
context
.
TODO
(),
logutil
.
LevelTrace
,
"builtin parser output"
,
"parser"
,
m
.
Config
.
Parser
,
"content"
,
content
,
"thinking"
,
thinking
,
"toolCalls"
,
toolCalls
,
"done"
,
r
.
Done
)
ch
<-
res
}
else
{
slog
.
Log
(
context
.
TODO
(),
logutil
.
LevelTrace
,
"builtin parser empty output"
,
"parser"
,
m
.
Config
.
Parser
)
}
return
}
...
...
@@ -2036,8 +2071,18 @@ func (s *Server) ChatHandler(c *gin.Context) {
// need to accumulate more to decide what to send
return
}
res
.
Message
.
Content
=
remainingContent
res
.
Message
.
Thinking
=
thinkingContent
tb
.
WriteString
(
thinkingContent
)
// emit the collected thinking text before restarting with structured outputs and clear unstructured content
// to avoid leaking mixed tokens like "</think>Hello"
if
structuredOutputsState
==
structuredOutputsState_None
&&
req
.
Format
!=
nil
&&
tb
.
String
()
!=
""
&&
remainingContent
!=
""
{
structuredOutputsState
=
structuredOutputsState_ReadyToApply
res
.
Message
.
Content
=
""
ch
<-
res
cancel
()
return
}
res
.
Message
.
Content
=
remainingContent
}
if
len
(
req
.
Tools
)
>
0
{
...
...
@@ -2059,8 +2104,42 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
ch
<-
res
});
err
!=
nil
{
})
if
err
!=
nil
{
if
structuredOutputsState
==
structuredOutputsState_ReadyToApply
&&
strings
.
Contains
(
err
.
Error
(),
"context canceled"
)
&&
c
.
Request
.
Context
()
.
Err
()
==
nil
{
// only ignores error if it's a context cancellation due to setting structured outputs
}
else
{
ch
<-
gin
.
H
{
"error"
:
err
.
Error
()}
return
}
}
// an ignored structured outputs cancellation falls through to here: start a new request with structured outputs applied and an updated prompt that includes the thinking collected so far
if
structuredOutputsState
==
structuredOutputsState_ReadyToApply
{
structuredOutputsState
=
structuredOutputsState_Applying
msg
:=
api
.
Message
{
Role
:
"assistant"
,
Thinking
:
tb
.
String
(),
}
msgs
=
append
(
msgs
,
msg
)
prompt
,
_
,
err
=
chatPrompt
(
c
.
Request
.
Context
(),
m
,
r
.
Tokenize
,
opts
,
msgs
,
processedTools
,
req
.
Think
)
if
err
!=
nil
{
slog
.
Error
(
"chat prompt error applying structured outputs"
,
"error"
,
err
)
ch
<-
gin
.
H
{
"error"
:
err
.
Error
()}
return
}
// force constraining by terminating the thinking header; the parser is already at this state.
// when the last message is thinking, the renderer for gpt-oss cannot disambiguate between having the
// model continue thinking or ending thinking and outputting the final message.
// TODO(parthsareen): consider adding prefill disambiguation logic to the renderer for structured outputs.
if
shouldUseHarmony
(
m
)
||
(
builtinParser
!=
nil
&&
m
.
Config
.
Parser
==
"harmony"
)
{
prompt
+=
"<|end|><|start|>assistant<|channel|>final<|message|>"
}
continue
}
break
}
}()
...
...
server/routes_generate_test.go
View file @
77060d46
...
...
@@ -1191,4 +1191,238 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
t
.
Errorf
(
"expected content %q, got %q"
,
"Based on my analysis, the solution is straightforward."
,
got
)
}
})
t
.
Run
(
"structured outputs restart non-stream"
,
func
(
t
*
testing
.
T
)
{
var
(
requestsMu
sync
.
Mutex
requests
[]
llm
.
CompletionRequest
wg
sync
.
WaitGroup
)
wg
.
Add
(
2
)
format
:=
json
.
RawMessage
(
`{"type":"object","properties":{"answer":{"type":"string"}}}`
)
mock
.
CompletionFn
=
func
(
ctx
context
.
Context
,
r
llm
.
CompletionRequest
,
fn
func
(
r
llm
.
CompletionResponse
))
error
{
defer
wg
.
Done
()
requestsMu
.
Lock
()
requests
=
append
(
requests
,
r
)
callNum
:=
len
(
requests
)
requestsMu
.
Unlock
()
switch
callNum
{
case
1
:
fn
(
llm
.
CompletionResponse
{
Content
:
" I am thinking through this problem. </think> {
\"
answer
\"
:
\"
42
\"
}"
,
Done
:
false
,
PromptEvalCount
:
1
,
PromptEvalDuration
:
1
,
})
select
{
case
<-
ctx
.
Done
()
:
return
ctx
.
Err
()
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatalf
(
"timeout waiting for structured outputs cancellation"
)
return
nil
}
case
2
:
fn
(
llm
.
CompletionResponse
{
Content
:
`{"answer":"42"}`
,
Done
:
true
,
DoneReason
:
llm
.
DoneReasonStop
,
PromptEvalCount
:
1
,
PromptEvalDuration
:
1
,
EvalCount
:
1
,
EvalDuration
:
1
,
})
return
nil
default
:
t
.
Fatalf
(
"unexpected number of completion calls: %d"
,
callNum
)
return
nil
}
}
think
:=
true
streamRequest
:=
false
w
:=
createRequest
(
t
,
s
.
ChatHandler
,
api
.
ChatRequest
{
Model
:
"test-thinking"
,
Messages
:
[]
api
.
Message
{{
Role
:
"user"
,
Content
:
"Please respond in JSON."
}},
Think
:
&
api
.
ThinkValue
{
Value
:
think
},
Stream
:
&
streamRequest
,
Format
:
format
,
})
wg
.
Wait
()
mock
.
CompletionFn
=
nil
if
w
.
Code
!=
http
.
StatusOK
{
t
.
Fatalf
(
"expected status 200, got %d"
,
w
.
Code
)
}
if
len
(
requests
)
!=
2
{
t
.
Fatalf
(
"expected two completion calls, got %d"
,
len
(
requests
))
}
if
requests
[
0
]
.
Format
!=
nil
{
t
.
Errorf
(
"expected first completion format to be nil, got %q"
,
requests
[
0
]
.
Format
)
}
if
!
bytes
.
Equal
([]
byte
(
format
),
[]
byte
(
requests
[
1
]
.
Format
))
{
t
.
Errorf
(
"expected second completion format to match original format"
)
}
var
resp
api
.
ChatResponse
if
err
:=
json
.
NewDecoder
(
w
.
Body
)
.
Decode
(
&
resp
);
err
!=
nil
{
t
.
Fatal
(
err
)
}
if
resp
.
Message
.
Thinking
!=
"I am thinking through this problem. "
{
t
.
Errorf
(
"expected thinking %q, got %q"
,
"I am thinking through this problem. "
,
resp
.
Message
.
Thinking
)
}
if
resp
.
Message
.
Content
!=
`{"answer":"42"}`
{
t
.
Errorf
(
"expected content %q, got %q"
,
`{"answer":"42"}`
,
resp
.
Message
.
Content
)
}
if
!
resp
.
Done
{
t
.
Errorf
(
"expected response to be done"
)
}
if
resp
.
DoneReason
!=
"stop"
{
t
.
Errorf
(
"expected done reason stop, got %s"
,
resp
.
DoneReason
)
}
})
t
.
Run
(
"structured outputs restart streaming"
,
func
(
t
*
testing
.
T
)
{
var
(
requestsMu
sync
.
Mutex
requests
[]
llm
.
CompletionRequest
wg
sync
.
WaitGroup
)
wg
.
Add
(
2
)
format
:=
json
.
RawMessage
(
`{"type":"object","properties":{"answer":{"type":"string"}}}`
)
mock
.
CompletionFn
=
func
(
ctx
context
.
Context
,
r
llm
.
CompletionRequest
,
fn
func
(
r
llm
.
CompletionResponse
))
error
{
defer
wg
.
Done
()
requestsMu
.
Lock
()
requests
=
append
(
requests
,
r
)
callNum
:=
len
(
requests
)
requestsMu
.
Unlock
()
switch
callNum
{
case
1
:
fn
(
llm
.
CompletionResponse
{
Content
:
" I am thinking through this problem. </think> {
\"
answer
\"
:
\"
42
\"
}"
,
Done
:
false
,
PromptEvalCount
:
1
,
PromptEvalDuration
:
1
,
})
select
{
case
<-
ctx
.
Done
()
:
return
ctx
.
Err
()
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatalf
(
"timeout waiting for structured outputs cancellation"
)
return
nil
}
case
2
:
fn
(
llm
.
CompletionResponse
{
Content
:
`{"answer":"42"}`
,
Done
:
true
,
DoneReason
:
llm
.
DoneReasonStop
,
PromptEvalCount
:
1
,
PromptEvalDuration
:
1
,
EvalCount
:
1
,
EvalDuration
:
1
,
})
return
nil
default
:
t
.
Fatalf
(
"unexpected number of completion calls: %d"
,
callNum
)
return
nil
}
}
think
:=
true
streamRequest
:=
true
w
:=
createRequest
(
t
,
s
.
ChatHandler
,
api
.
ChatRequest
{
Model
:
"test-thinking"
,
Messages
:
[]
api
.
Message
{{
Role
:
"user"
,
Content
:
"Please respond in JSON."
}},
Think
:
&
api
.
ThinkValue
{
Value
:
think
},
Stream
:
&
streamRequest
,
Format
:
format
,
})
wg
.
Wait
()
mock
.
CompletionFn
=
nil
if
w
.
Code
!=
http
.
StatusOK
{
t
.
Fatalf
(
"expected status 200, got %d"
,
w
.
Code
)
}
if
len
(
requests
)
!=
2
{
t
.
Fatalf
(
"expected two completion calls, got %d"
,
len
(
requests
))
}
if
requests
[
0
]
.
Format
!=
nil
{
t
.
Errorf
(
"expected first completion format to be nil, got %q"
,
requests
[
0
]
.
Format
)
}
if
!
bytes
.
Equal
([]
byte
(
format
),
[]
byte
(
requests
[
1
]
.
Format
))
{
t
.
Errorf
(
"expected second completion format to match original format"
)
}
decoder
:=
json
.
NewDecoder
(
w
.
Body
)
var
events
[]
api
.
ChatResponse
for
{
var
event
api
.
ChatResponse
if
err
:=
decoder
.
Decode
(
&
event
);
err
==
io
.
EOF
{
break
}
else
if
err
!=
nil
{
t
.
Fatal
(
err
)
}
events
=
append
(
events
,
event
)
if
event
.
Done
{
break
}
}
if
len
(
events
)
<
2
{
t
.
Fatalf
(
"expected at least two streaming events, got %d"
,
len
(
events
))
}
first
:=
events
[
0
]
if
first
.
Message
.
Thinking
!=
"I am thinking through this problem. "
{
t
.
Errorf
(
"expected first event thinking %q, got %q"
,
"I am thinking through this problem. "
,
first
.
Message
.
Thinking
)
}
if
first
.
Message
.
Content
!=
""
{
t
.
Errorf
(
"expected first event content to be empty, got %q"
,
first
.
Message
.
Content
)
}
if
first
.
Done
{
t
.
Error
(
"expected first event to be non-terminal"
)
}
last
:=
events
[
len
(
events
)
-
1
]
if
last
.
Message
.
Thinking
!=
""
{
t
.
Errorf
(
"expected final event thinking to be empty, got %q"
,
last
.
Message
.
Thinking
)
}
if
last
.
Message
.
Content
!=
`{"answer":"42"}`
{
t
.
Errorf
(
"expected final event content %q, got %q"
,
`{"answer":"42"}`
,
last
.
Message
.
Content
)
}
if
!
last
.
Done
{
t
.
Error
(
"expected final event to be done"
)
}
if
last
.
DoneReason
!=
"stop"
{
t
.
Errorf
(
"expected final done reason stop, got %s"
,
last
.
DoneReason
)
}
})
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment