OpenDAS / ollama · Commit 5db8a818 (Unverified)

Authored Oct 11, 2025 by Devon Rifkin; committed by GitHub on Oct 11, 2025.

Merge pull request #12581 from ollama/drifkin/renderer-api-generate

routes: fix built-in renderers for `api/generate`

Parents: 0c68ec8d 6db8da99
3 changed files with 341 additions and 10 deletions (+341 -10):

server/routes.go                          +27   -9
server/routes_debug_test.go                +1   -1
server/routes_generate_renderer_test.go  +313   -0
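For illustration only (this snippet is not part of the commit): after this fix, a plain Prompt/System request against a model that carries a built-in renderer, with no Template or Suffix override, is rendered by that renderer instead of the legacy Go template. The sketch below uses the Go client from github.com/ollama/ollama/api; the DebugRenderOnly and DebugInfo fields are taken from the tests added in this commit, and the model name "test-renderer" is an assumption (any local model created with a built-in renderer such as qwen3-coder would do).

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	// A chat-like generate request: Prompt and System only, no Template or Suffix.
	// DebugRenderOnly asks the server to return the rendered prompt instead of
	// running the model, which makes the renderer output easy to inspect.
	req := &api.GenerateRequest{
		Model:           "test-renderer", // assumed: a local model created with a built-in renderer
		Prompt:          "Write a hello world function",
		System:          "You are a helpful coding assistant.",
		DebugRenderOnly: true,
	}

	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		if resp.DebugInfo != nil {
			// With a qwen3-coder style renderer this should contain <|im_start|>/<|im_end|> tags.
			fmt.Println(resp.DebugInfo.RenderedTemplate)
		}
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}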
server/routes.go (view file @ 5db8a818)
@@ -403,12 +403,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			msgs = append(msgs, m.Messages...)
 		}
 
+		userMsg := api.Message{Role: "user", Content: req.Prompt}
 		for _, i := range images {
-			imgPrompt := ""
-			msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
+			userMsg.Images = append(userMsg.Images, i.Data)
 		}
-
-		values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
+		values.Messages = append(msgs, userMsg)
 	}
 
 	values.Think = req.Think != nil && req.Think.Bool()
@@ -429,6 +428,24 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			b.WriteString(s)
 		}
 
+		// check that we're in the `api/chat`-like flow, and if so, generate the
+		// prompt the same way
+		// TEMP(drifkin): we should really just detect the chat-like flow and call
+		// the real chat handler, but doing this as a stopgap to get renderer
+		// support for generate
+		if values.Messages != nil && values.Suffix == "" && req.Template == "" {
+			prompt, images, err = chatPrompt(c.Request.Context(), m, r.Tokenize, opts, values.Messages, []api.Tool{}, req.Think)
+			if err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+
+			// TEMP(drifkin): req.Context will be removed very soon, but we're temporarily supporting it in this flow here
+			if req.Context != nil {
+				b.WriteString(prompt)
+				prompt = b.String()
+			}
+		} else {
+			// legacy flow
-		if err := tmpl.Execute(&b, values); err != nil {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-			return
+			if err := tmpl.Execute(&b, values); err != nil {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
@@ -436,6 +453,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
...
@@ -436,6 +453,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt
=
b
.
String
()
prompt
=
b
.
String
()
}
}
}
// If debug mode is enabled, return the rendered template instead of calling the model
// If debug mode is enabled, return the rendered template instead of calling the model
if
req
.
DebugRenderOnly
{
if
req
.
DebugRenderOnly
{
...
...
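The condition guarding the new path above reduces to a small predicate. The sketch below is not a helper that exists in routes.go; it simply restates the check `values.Messages != nil && values.Suffix == "" && req.Template == ""` that chooses between the built-in renderer flow and the legacy template flow.

package main

import "fmt"

// usesBuiltinRendererFlow mirrors the chat-like detection in GenerateHandler
// (sketch only; ollama does not define this helper): messages were assembled,
// no fill-in-the-middle suffix was supplied, and no per-request template override.
func usesBuiltinRendererFlow(messagesPresent bool, suffix, templateOverride string) bool {
	return messagesPresent && suffix == "" && templateOverride == ""
}

func main() {
	fmt.Println(usesBuiltinRendererFlow(true, "", ""))            // true: renderer path
	fmt.Println(usesBuiltinRendererFlow(true, " return c", ""))   // false: a suffix forces the legacy template flow
	fmt.Println(usesBuiltinRendererFlow(true, "", "{{ .Prompt }}")) // false: a custom template bypasses the renderer
}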
server/routes_debug_test.go (view file @ 5db8a818)

@@ -146,7 +146,7 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
 			DebugRenderOnly: true,
 		},
 		expectDebug:     true,
-		expectTemplate:  "[img-0]\n\nDescribe this image",
+		expectTemplate:  "[img-0]Describe this image",
 		expectNumImages: 1,
 	},
 	{
server/routes_generate_renderer_test.go (new file, 0 → 100644, view file @ 5db8a818)
package server

import (
	"bytes"
	"encoding/json"
	"net/http"
	"strings"
	"testing"
	"time"

	"github.com/gin-gonic/gin"
	"github.com/google/go-cmp/cmp"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)
// TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
// when in chat-like flow (messages present, no suffix, no template)
func TestGenerateWithBuiltinRenderer(t *testing.T) {
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
		CompletionResponse: llm.CompletionResponse{
			Done:               true,
			DoneReason:         llm.DoneReasonStop,
			PromptEvalCount:    1,
			PromptEvalDuration: 1,
			EvalCount:          1,
			EvalDuration:       1,
		},
	}

	s := Server{
		sched: &Scheduler{
			pendingReqCh:  make(chan *LlmRequest, 1),
			finishedReqCh: make(chan *LlmRequest, 1),
			expiredCh:     make(chan *runnerRef, 1),
			unloadedCh:    make(chan any, 1),
			loaded:        make(map[string]*runnerRef),
			newServerFn:   newMockServer(&mock),
			getGpuFn:      getGpuFn,
			getCpuFn:      getCpuFn,
			reschedDelay:  250 * time.Millisecond,
			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
				time.Sleep(time.Millisecond)
				req.successCh <- &runnerRef{
					llama: &mock,
				}
				return false
			},
		},
	}

	go s.sched.Run(t.Context())

	// Create a model with a built-in renderer (qwen3-coder)
	_, digest := createBinFile(t, ggml.KV{
		"general.architecture":          "qwen3",
		"qwen3.block_count":             uint32(1),
		"qwen3.context_length":          uint32(8192),
		"qwen3.embedding_length":        uint32(4096),
		"qwen3.attention.head_count":    uint32(32),
		"qwen3.attention.head_count_kv": uint32(8),
		"tokenizer.ggml.tokens":         []string{""},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []*ggml.Tensor{
		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	})

	// Create a model with the qwen3-coder renderer
	w := createRequest(t, s.CreateHandler, api.CreateRequest{
		Model:    "test-renderer",
		Files:    map[string]string{"file.gguf": digest},
		Renderer: "qwen3-coder",
		Stream:   &stream,
	})
	if w.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", w.Code)
	}

	mock.CompletionResponse.Content = "Hi!"

	t.Run("chat-like flow uses renderer", func(t *testing.T) {
		// Test that when using messages (chat-like flow), the built-in renderer is used
		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
			Model:  "test-renderer",
			Prompt: "Write a hello world function",
			Stream: &stream,
		})
		if w.Code != http.StatusOK {
			t.Errorf("expected status 200, got %d", w.Code)
		}

		// The qwen3-coder renderer produces output with <|im_start|> and <|im_end|> tags
		// When messages are built internally from prompt, it should use the renderer
		if !strings.Contains(mock.CompletionRequest.Prompt, "<|im_start|>") {
			t.Errorf("expected prompt to contain <|im_start|> from qwen3-coder renderer, got: %s", mock.CompletionRequest.Prompt)
		}
		if !strings.Contains(mock.CompletionRequest.Prompt, "<|im_end|>") {
			t.Errorf("expected prompt to contain <|im_end|> from qwen3-coder renderer, got: %s", mock.CompletionRequest.Prompt)
		}
	})

	t.Run("chat-like flow with system message uses renderer", func(t *testing.T) {
		// Test that system messages work with the renderer
		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
			Model:  "test-renderer",
			Prompt: "Write a hello world function",
			System: "You are a helpful coding assistant.",
			Stream: &stream,
		})
		if w.Code != http.StatusOK {
			t.Errorf("expected status 200, got %d", w.Code)
		}

		// Should contain the system message and use renderer format
		if !strings.Contains(mock.CompletionRequest.Prompt, "<|im_start|>system") {
			t.Errorf("expected prompt to contain system message with renderer format, got: %s", mock.CompletionRequest.Prompt)
		}
		if !strings.Contains(mock.CompletionRequest.Prompt, "You are a helpful coding assistant.") {
			t.Errorf("expected prompt to contain system message content, got: %s", mock.CompletionRequest.Prompt)
		}
	})

	t.Run("custom template bypasses renderer", func(t *testing.T) {
		// Test that providing a custom template uses the legacy flow
		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
			Model:    "test-renderer",
			Prompt:   "Write a hello world function",
			Template: "{{ .Prompt }}",
			Stream:   &stream,
		})
		if w.Code != http.StatusOK {
			t.Errorf("expected status 200, got %d", w.Code)
		}

		// Should NOT use the renderer format when custom template is provided
		if strings.Contains(mock.CompletionRequest.Prompt, "<|im_start|>") {
			t.Errorf("expected prompt to NOT use renderer when custom template provided, got: %s", mock.CompletionRequest.Prompt)
		}

		// Should just be the raw prompt from the template
		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "Write a hello world function"); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	})

	// Create a model with suffix support for the next test
	w = createRequest(t, s.CreateHandler, api.CreateRequest{
		Model: "test-suffix-renderer",
		From:  "test-renderer",
		Template: `{{- if .Suffix }}<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
{{- else }}{{ .Prompt }}
{{- end }}`,
	})
	if w.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", w.Code)
	}

	t.Run("suffix bypasses renderer", func(t *testing.T) {
		// Test that providing a suffix uses the legacy flow
		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
			Model:  "test-suffix-renderer",
			Prompt: "def add(",
			Suffix: " return c",
		})
		if w.Code != http.StatusOK {
			t.Errorf("expected status 200, got %d", w.Code)
		}

		// Should NOT use the renderer format when suffix is provided
		if strings.Contains(mock.CompletionRequest.Prompt, "<|im_start|>") {
			t.Errorf("expected prompt to NOT use renderer when suffix provided, got: %s", mock.CompletionRequest.Prompt)
		}

		// Should use the suffix template format
		if diff := cmp.Diff(mock.CompletionRequest.Prompt, "<PRE> def add( <SUF> return c <MID>"); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	})
}
// TestGenerateWithDebugRenderOnly tests that debug_render_only works with built-in renderers
func TestGenerateWithDebugRenderOnly(t *testing.T) {
	gin.SetMode(gin.TestMode)

	mock := mockRunner{
		CompletionResponse: llm.CompletionResponse{
			Done:               true,
			DoneReason:         llm.DoneReasonStop,
			PromptEvalCount:    1,
			PromptEvalDuration: 1,
			EvalCount:          1,
			EvalDuration:       1,
		},
	}

	s := Server{
		sched: &Scheduler{
			pendingReqCh:  make(chan *LlmRequest, 1),
			finishedReqCh: make(chan *LlmRequest, 1),
			expiredCh:     make(chan *runnerRef, 1),
			unloadedCh:    make(chan any, 1),
			loaded:        make(map[string]*runnerRef),
			newServerFn:   newMockServer(&mock),
			getGpuFn:      getGpuFn,
			getCpuFn:      getCpuFn,
			reschedDelay:  250 * time.Millisecond,
			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
				time.Sleep(time.Millisecond)
				req.successCh <- &runnerRef{
					llama: &mock,
				}
				return false
			},
		},
	}

	go s.sched.Run(t.Context())

	// Create a model with a built-in renderer
	_, digest := createBinFile(t, ggml.KV{
		"general.architecture":          "qwen3",
		"qwen3.block_count":             uint32(1),
		"qwen3.context_length":          uint32(8192),
		"qwen3.embedding_length":        uint32(4096),
		"qwen3.attention.head_count":    uint32(32),
		"qwen3.attention.head_count_kv": uint32(8),
		"tokenizer.ggml.tokens":         []string{""},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []*ggml.Tensor{
		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	})

	w := createRequest(t, s.CreateHandler, api.CreateRequest{
		Model:    "test-debug-renderer",
		Files:    map[string]string{"file.gguf": digest},
		Renderer: "qwen3-coder",
		Stream:   &stream,
	})
	if w.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", w.Code)
	}

	t.Run("debug_render_only with renderer", func(t *testing.T) {
		w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
			Model:           "test-debug-renderer",
			Prompt:          "Write a hello world function",
			System:          "You are a coding assistant",
			DebugRenderOnly: true,
		})
		if w.Code != http.StatusOK {
			t.Errorf("expected status 200, got %d", w.Code)
		}

		var resp api.GenerateResponse
		if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
			t.Fatal(err)
		}

		if resp.DebugInfo == nil {
			t.Fatalf("expected debug info, got nil")
		}

		// Verify that the rendered template uses the built-in renderer
		if !strings.Contains(resp.DebugInfo.RenderedTemplate, "<|im_start|>") {
			t.Errorf("expected rendered template to use qwen3-coder renderer format, got: %s", resp.DebugInfo.RenderedTemplate)
		}
		if !strings.Contains(resp.DebugInfo.RenderedTemplate, "You are a coding assistant") {
			t.Errorf("expected rendered template to contain system message, got: %s", resp.DebugInfo.RenderedTemplate)
		}
		if !strings.Contains(resp.DebugInfo.RenderedTemplate, "Write a hello world function") {
			t.Errorf("expected rendered template to contain prompt, got: %s", resp.DebugInfo.RenderedTemplate)
		}
	})
}