package kvcache

import (
	"fmt"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// Encoder cache stores K and V tensors that are position independent
//
// The tensors can be of any shape and will be returned as they were stored
// The mask is currently always nil
//
// Not currently safe for multiple sequences
type EncoderCache struct {
17
18
19
	// config controls mostly backend-specific optimizations
	config *ml.CacheConfig

Jesse Gross's avatar
Jesse Gross committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
	// ** current forward pass **

	// the active layer for Get and Put
	curLayer int

	// if something is stored during this pass, this
	// will be the position (but there is no guarantee
	// anything will be stored)
	curPos int32

	// ** cache metadata **

	// was something stored in the cache?
	encoderCached bool

	// position of the cached data
	encoderPos int32

	// ** cache data storage **
39
40
41
	backend      ml.Backend
	ctxs         map[int]ml.Context
	keys, values map[int]ml.Tensor
Jesse Gross's avatar
Jesse Gross committed
42
43
44
}

func NewEncoderCache() *EncoderCache {
45
46
47
48
49
	return &EncoderCache{
		ctxs:   make(map[int]ml.Context),
		keys:   make(map[int]ml.Tensor),
		values: make(map[int]ml.Tensor),
	}
Jesse Gross's avatar
Jesse Gross committed
50
51
52
}

func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
53
54
55
56
57
58
59
60
61
62
63
64
	if c.config == nil {
		var config ml.CacheConfig
		if cc, ok := backend.(ml.BackendCacheConfig); ok {
			config = cc.CacheConfig()
		}
		c.config = &config
	}

	if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
	}

65
	c.backend = backend
Jesse Gross's avatar
Jesse Gross committed
66
67
}

68
69
70
71
72
73
74
75
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
	if c.config != nil {
		panic("config cannot be changed after being previously set, either by the model or backend")
	}

	c.config = &config
}

Jesse Gross's avatar
Jesse Gross committed
76
func (c *EncoderCache) Close() {
77
78
79
	for _, ctx := range c.ctxs {
		ctx.Close()
	}
Jesse Gross's avatar
Jesse Gross committed
80
81
}

82
83
84
85
86
func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
	// We work with the most recent image
	if len(opts.Multimodal) > 0 {
		c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
	}
Jesse Gross's avatar
Jesse Gross committed
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106

	return nil
}

// SetLayer selects the layer that subsequent Get and Put calls operate on.
func (c *EncoderCache) SetLayer(layer int) {
	c.curLayer = layer
}

// EncoderCached reports whether something is currently stored in the cache.
// The flag is set by Put and cleared by Remove when the removed range covers
// the cached position.
func (c *EncoderCache) EncoderCached() bool {
	return c.encoderCached
}

// Get returns the key and value tensors stored for the active layer, in that
// order, followed by the mask, which is always nil. The ctx argument is not
// used. If nothing has been stored for the layer, the map lookups yield nil
// tensors.
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	layer := c.curLayer
	key, value := c.keys[layer], c.values[layer]

	return key, value, nil
}

func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
	c.encoderPos = c.curPos
	c.encoderCached = true

107
108
109
110
	if c.config.PermutedV {
		value = value.Permute(ctx, 1, 2, 0, 3)
	}

111
	if _, ok := c.ctxs[c.curLayer]; !ok {
112
		c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
113
114
115
116
117
118
119
120
	}

	if _, ok := c.keys[c.curLayer]; !ok {
		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
	}

	if _, ok := c.values[c.curLayer]; !ok {
		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
Jesse Gross's avatar
Jesse Gross committed
121
122
	}

123
124
125
126
	ctx.Forward(
		key.Copy(ctx, c.keys[c.curLayer]),
		value.Copy(ctx, c.values[c.curLayer]),
	)
Jesse Gross's avatar
Jesse Gross committed
127
128
129
130
131
132
133
134
135
136
137
138
139
}

// CopyPrefix is not supported by the encoder cache and always panics,
// because the cache does not handle multiple sequences.
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
	panic("encoder cache does not support multiple sequences")
}

// Remove invalidates the cached data when its position lies in the half-open
// range [beginIndex, endIndex). The seq argument is ignored and the returned
// error is always nil.
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
	if pos := c.encoderPos; beginIndex <= pos && pos < endIndex {
		c.encoderCached = false
	}

	return nil
}