Unverified commit e53b3cbd authored by Bruce MacDonald, committed by GitHub

llm: set done reason at server level (#9830)

No functional change. Many different done reasons can be set at the runner
level, so rather than obscuring them we should return them to the server
process and let it choose what to do with the done reason. This separates
the API concerns from the runner.
parent b51e0f39
@@ -675,9 +675,32 @@ type CompletionRequest struct {
Grammar string // set before sending the request to the subprocess
}
+// DoneReason represents the reason why a completion response is done
+type DoneReason int
+
+const (
+// DoneReasonStop indicates the completion stopped naturally
+DoneReasonStop DoneReason = iota
+// DoneReasonLength indicates the completion stopped due to length limits
+DoneReasonLength
+// DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
+DoneReasonConnectionClosed
+)
+
+func (d DoneReason) String() string {
+switch d {
+case DoneReasonLength:
+return "length"
+case DoneReasonStop:
+return "stop"
+default:
+return "" // closed
+}
+}
+
type CompletionResponse struct {
Content string `json:"content"`
-DoneReason string `json:"done_reason"`
+DoneReason DoneReason `json:"done_reason"`
Done bool `json:"done"`
PromptEvalCount int `json:"prompt_eval_count"`
PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
@@ -786,7 +809,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
continue
}
-// slog.Debug("got line", "line", string(line))
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
evt = line
...
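Because DoneReason is declared as an int with a `json:"done_reason"` tag, the reason now crosses the runner/server boundary as an integer; the human-readable string is produced only when the server calls String before building the public API response. A minimal sketch of that wire behavior, written as a test assumed to sit in the same llm package as the types above (illustrative only, not part of this commit):

package llm

import (
	"encoding/json"
	"testing"
)

// TestDoneReasonWireFormat documents that the typed reason marshals as
// its integer value between the runner subprocess and the server.
func TestDoneReasonWireFormat(t *testing.T) {
	b, err := json.Marshal(DoneReasonLength)
	if err != nil {
		t.Fatal(err)
	}
	// iota order above: stop=0, length=1, connection closed=2
	if string(b) != "1" {
		t.Fatalf("expected an integer on the wire, got %s", b)
	}

	var d DoneReason
	if err := json.Unmarshal(b, &d); err != nil {
		t.Fatal(err)
	}
	if d != DoneReasonLength {
		t.Fatalf("round trip changed the value: got %d", d)
	}
}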
@@ -83,7 +83,7 @@ type Sequence struct {
// true if an embedding are to be returned instead of text generation
embeddingOnly bool
-doneReason string
+doneReason llm.DoneReason
// Metrics
startProcessingTime time.Time
@@ -301,7 +301,7 @@ func flushPending(seq *Sequence) bool {
}
}
-func (s *Server) removeSequence(seqIndex int, reason string) {
+func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
seq := s.seqs[seqIndex]
flushPending(seq)
@@ -380,7 +380,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
// if past the num predict limit
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
-s.removeSequence(seqIdx, "limit")
+s.removeSequence(seqIdx, llm.DoneReasonLength)
continue
}
@@ -482,7 +482,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
seq.embedding <- embed
-s.removeSequence(i, "")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -499,7 +499,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
// as it's important for the /api/generate context
// seq.responses <- piece
-s.removeSequence(i, "stop")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -530,7 +530,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
-s.removeSequence(i, "stop")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -543,7 +543,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
}
if !flushPending(seq) {
-s.removeSequence(i, "connection")
+s.removeSequence(i, llm.DoneReasonConnectionClosed)
}
}
@@ -657,14 +657,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
flusher.Flush()
} else {
// Send the final response
-doneReason := "stop"
-if seq.doneReason == "limit" {
-doneReason = "length"
-}
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Done: true,
-DoneReason: doneReason,
+DoneReason: seq.doneReason,
PromptEvalCount: seq.numPromptInputs,
PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
EvalCount: seq.numDecoded,
...
@@ -82,7 +82,7 @@ type Sequence struct {
// true if an embedding are to be returned instead of text generation
embeddingOnly bool
-doneReason string
+doneReason llm.DoneReason
// Metrics
startProcessingTime time.Time
@@ -341,7 +341,7 @@ func flushPending(seq *Sequence) bool {
}
}
-func (s *Server) removeSequence(seqIndex int, reason string) {
+func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
seq := s.seqs[seqIndex]
flushPending(seq)
@@ -391,7 +391,7 @@ func (s *Server) processBatch() error {
// if past the num predict limit
if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
-s.removeSequence(seqIdx, "limit")
+s.removeSequence(seqIdx, llm.DoneReasonLength)
continue
}
@@ -510,7 +510,7 @@ func (s *Server) processBatch() error {
if seq.embeddingOnly {
// TODO(jessegross): Embedding support
slog.Warn("generation of embedding outputs not yet supported")
-s.removeSequence(i, "")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -528,7 +528,7 @@ func (s *Server) processBatch() error {
// as it's important for the /api/generate context
// seq.responses <- piece
-s.removeSequence(i, "stop")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -564,7 +564,7 @@ func (s *Server) processBatch() error {
}
seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
-s.removeSequence(i, "stop")
+s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -577,7 +577,7 @@ func (s *Server) processBatch() error {
}
if !flushPending(seq) {
-s.removeSequence(i, "connection")
+s.removeSequence(i, llm.DoneReasonConnectionClosed)
}
}
@@ -690,14 +690,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
flusher.Flush()
} else {
// Send the final response
-doneReason := "stop"
-if seq.doneReason == "limit" {
-doneReason = "length"
-}
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Done: true,
-DoneReason: doneReason,
+DoneReason: seq.doneReason,
PromptEvalCount: seq.numPromptInputs,
PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
EvalCount: seq.numPredicted,
...
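Across the two runner diffs the same four legacy strings are retired in the same way: "limit" becomes llm.DoneReasonLength, "stop" and the empty string both become llm.DoneReasonStop, and "connection" becomes llm.DoneReasonConnectionClosed. The "limit"-to-"length" translation that each runner's completion handler previously did by hand now lives in DoneReason.String.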
@@ -308,11 +308,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
Options: opts,
}, func(cr llm.CompletionResponse) {
res := api.GenerateResponse{
-Model: req.Model,
-CreatedAt: time.Now().UTC(),
-Response: cr.Content,
-Done: cr.Done,
-DoneReason: cr.DoneReason,
+Model: req.Model,
+CreatedAt: time.Now().UTC(),
+Response: cr.Content,
+Done: cr.Done,
Metrics: api.Metrics{
PromptEvalCount: cr.PromptEvalCount,
PromptEvalDuration: cr.PromptEvalDuration,
@@ -326,6 +325,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
}
if cr.Done {
+res.DoneReason = cr.DoneReason.String()
res.TotalDuration = time.Since(checkpointStart)
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
@@ -1533,11 +1533,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
Options: opts,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{
-Model: req.Model,
-CreatedAt: time.Now().UTC(),
-Message: api.Message{Role: "assistant", Content: r.Content},
-Done: r.Done,
-DoneReason: r.DoneReason,
+Model: req.Model,
+CreatedAt: time.Now().UTC(),
+Message: api.Message{Role: "assistant", Content: r.Content},
+Done: r.Done,
Metrics: api.Metrics{
PromptEvalCount: r.PromptEvalCount,
PromptEvalDuration: r.PromptEvalDuration,
@@ -1547,6 +1546,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
if r.Done {
+res.DoneReason = r.DoneReason.String()
res.TotalDuration = time.Since(checkpointStart)
res.LoadDuration = checkpointLoaded.Sub(checkpointStart)
}
...
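Note that in both handlers the public done_reason string is now derived only once, on the final response: the typed value is converted with String() inside the if cr.Done / if r.Done block instead of being copied verbatim into every streamed chunk.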
@@ -58,7 +58,7 @@ func TestGenerateChat(t *testing.T) {
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
-DoneReason: "stop",
+DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
@@ -401,7 +401,7 @@ func TestGenerateChat(t *testing.T) {
mock.CompletionResponse = llm.CompletionResponse{
Content: `{"name":"get_weather","arguments":{"location":"Seattle, WA","unit":"celsius"}}`,
Done: true,
-DoneReason: "done",
+DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
@@ -519,7 +519,7 @@ func TestGenerateChat(t *testing.T) {
{
Content: `, WA","unit":"celsius"}}`,
Done: true,
-DoneReason: "tool_call",
+DoneReason: llm.DoneReasonStop,
PromptEvalCount: 3,
PromptEvalDuration: 1,
},
@@ -594,7 +594,7 @@ func TestGenerate(t *testing.T) {
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
-DoneReason: "stop",
+DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
...
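The handler tests above only pin DoneReasonStop end to end. A table-driven unit test for the full String mapping, including the deliberate empty string for a closed connection, might look like the following sketch (again assumed to live in the llm package; not part of this commit):

package llm

import "testing"

func TestDoneReasonString(t *testing.T) {
	cases := []struct {
		reason DoneReason
		want   string
	}{
		{DoneReasonStop, "stop"},
		{DoneReasonLength, "length"},
		// a closed connection intentionally maps to no reason string
		{DoneReasonConnectionClosed, ""},
	}
	for _, tc := range cases {
		if got := tc.reason.String(); got != tc.want {
			t.Errorf("DoneReason(%d).String() = %q, want %q", int(tc.reason), got, tc.want)
		}
	}
}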