runner.go: Don't set cross attention before sending embeddings

Currently, if an input has embeddings at any point, we set cross attention to true from the beginning. This means that any tokens sent before the embeddings will incorrectly have cross attention layers applied. This change sets cross attention only once we have an embedding, either earlier in this sequence or in the cache. It also makes cross attention capable of supporting parallelism at the runner level, though the mllama implementation doesn't support that yet.

runner.go: Don't set cross attention before sending embeddings
Currently, if an input has embeddings at any point, we set cross attention to true from the beginning. This means that any tokens sent before the embeddings will incorrectly have cross attention layers applied. This change sets cross attention only once we have an embedding, either earlier in this sequence or in the cache. It also makes cross attention capable of supporting parallelism at the runner level, though the mllama implementation doesn't support that yet.
26acdcf4 · Jesse Gross · Jesse Gross · 921779bb · 26acdcf4 · 26acdcf4
Commit 26acdcf4 authored Oct 31, 2024 by Jesse Gross Committed by Jesse Gross Oct 31, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 9 deletions

llama/runner/image.go llama/runner/image.go +11 -0

llama/runner/runner.go llama/runner/runner.go +12 -9

No files found.
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
+	"slices"
 	"sync"
 	"time"

@@ -96,6 +97,16 @@ func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
 	}
 }

+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool {
+		return input.embed != nil
+	})
+}
+
 type imageCache struct {
 	key      uint64
 	val      [][]float32

--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -52,6 +52,10 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

+	// does this sequence require cross-attention layers to be processed? - if we have seen
+	// an image for certain multi-modal models
+	crossAttention bool
+
 	// channel to send responses over
 	responses chan string

@@ -287,7 +291,6 @@ func flushPending(seq *Sequence) bool {
 func (s *Server) removeSequence(seqIndex int, reason string) {
 	seq := s.seqs[seqIndex]

-	s.lc.SetCrossAttention(false)
 	flushPending(seq)
 	seq.doneReason = reason
 	close(seq.responses)
@@ -334,6 +337,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
+	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -367,8 +371,9 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 					batch = tokenBatch
 				} else {
 					batch = embedBatch
+					seq.crossAttention = s.image.NeedCrossAttention(input)
 				}
-			} else if embedding != batch.IsEmbedding() {
+			} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
 				s.nextSeq = seqIdx
 				break
 			}
@@ -378,6 +383,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}

+			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
 			seq.numPast++
 			numInputsProcessed++
@@ -394,6 +400,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return
 	}

+	s.lc.SetCrossAttention(crossAttention)
+
 	err := s.lc.Decode(batch)
 	if err != nil {
 		slog.Error("failed to decode batch", "error", err)
@@ -605,13 +613,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	s.mu.Lock()
 	for i, sq := range s.seqs {
 		if sq == nil {
-			for _, input := range seq.inputs {
-				if input.embed != nil {
-					s.lc.SetCrossAttention(true)
-					break
-				}
-			}
-
 			seq.cache, seq.inputs, seq.numPast, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
 			if err != nil {
 				s.mu.Unlock()
@@ -619,6 +620,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 				return
 			}

+			seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
+
 			s.seqs[i] = seq
 			s.cond.Signal()
 			break