cache.go 6.76 KB
Newer Older
Jesse Gross's avatar
Jesse Gross committed
1
package ollamarunner
2
3
4

import (
	"errors"
5
	"fmt"
6
	"log/slog"
Jesse Gross's avatar
Jesse Gross committed
7
	"math"
8
9
10
	"reflect"
	"time"

Jesse Gross's avatar
Jesse Gross committed
11
12
13
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
14
15
16
17
)

type InputCache struct {
	// context window size (per slot)
Jesse Gross's avatar
Jesse Gross committed
18
19
20
21
22
23
	numCtx int32

	// does the cache store data or do we need to always send the full input?
	// note that when enabled is false the underlying cache may either be nil
	// or a non-nil dummy that doesn't actually store anything
	enabled bool
24
25
26
27
28
29
30

	// individual KV caches
	slots []InputCacheSlot

	// optimize cache eviction for multiple users
	multiUserCache bool

Jesse Gross's avatar
Jesse Gross committed
31
	cache kvcache.Cache
32
33
}

Jesse Gross's avatar
Jesse Gross committed
34
35
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
	if kvSize/int32(numSlots) < 1 {
36
37
38
		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
	}

39
40
41
42
43
44
45
46
47
	slots := make([]InputCacheSlot, numSlots)

	for i := range slots {
		slots[i] = InputCacheSlot{
			Id:     i,
			Inputs: make([]input, 0),
		}
	}

Jesse Gross's avatar
Jesse Gross committed
48
49
50
51
52
	cache := model.Config().Cache
	if cache != nil {
		cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
	}

53
	return &InputCache{
Jesse Gross's avatar
Jesse Gross committed
54
55
		numCtx:         kvSize / int32(numSlots),
		enabled:        cache != nil,
56
57
		slots:          slots,
		multiUserCache: multiUserCache,
Jesse Gross's avatar
Jesse Gross committed
58
		cache:          cache,
59
	}, nil
60
61
}

Jesse Gross's avatar
Jesse Gross committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// kvCacheTypeFromStr maps a KV cache type name to the dtype used for the
// cache. The quantized types "q8_0" and "q4_0" are not implemented yet and
// cause a panic; every other value selects ml.DTypeF16.
func kvCacheTypeFromStr(s string) ml.DType {
	if s == "q8_0" || s == "q4_0" {
		panic("kv cache quantization not yet implemented")
	}

	return ml.DTypeF16
}

// Close releases the underlying KV cache, if there is one.
//
// It is a no-op on a nil InputCache and on an InputCache whose model did not
// provide a cache (cache is nil in that case, see NewInputCache).
func (c *InputCache) Close() {
	if c == nil || c.cache == nil {
		return
	}

	c.cache.Close()
}

77
78
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

// InputCacheSlot describes one sequence's portion of the KV cache.
type InputCacheSlot struct {
	// Id is the index of this slot in the KV cache
	Id int

	// Inputs are the inputs currently stored in the KV cache for this slot
	Inputs []input

	// InUse is true while this slot is actively being processed as part of
	// a sequence
	InUse bool

	// lastUsed is the last time this slot was used (as of start of processing)
	lastUsed time.Time
}

95
func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
96
	var slot *InputCacheSlot
Jesse Gross's avatar
Jesse Gross committed
97
	var numPast int32
98
99
100
	var err error

	// In single-user scenarios, the longest cache slot works fine for getting good input
Jesse Gross's avatar
Jesse Gross committed
101
	// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
102
	// For multiple users, the "best" cache slot produces better input cache hit rates
Jesse Gross's avatar
Jesse Gross committed
103
	// at the cost of worse performance when we miss the input cache.
104
105
106
107
108
109
	if !c.multiUserCache {
		slot, numPast, err = c.findLongestCacheSlot(prompt)
	} else {
		slot, numPast, err = c.findBestCacheSlot(prompt)
	}
	if err != nil {
110
		return nil, nil, err
111
112
113
114
115
116
117
118
119
	}

	if !cachePrompt {
		numPast = 0
	}

	slot.InUse = true
	slot.lastUsed = time.Now()

Jesse Gross's avatar
Jesse Gross committed
120
	if numPast == int32(len(prompt)) {
121
122
123
124
		// Leave one input to sample so we can get a response
		numPast--
	}

Jesse Gross's avatar
Jesse Gross committed
125
126
127
128
129
130
131
132
133
134
	if c.cache != nil {
		err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
		if err != nil {
			// Some models don't support partial erasure
			err = c.cache.Remove(slot.Id, 0, math.MaxInt32)
			if err != nil {
				return nil, nil, err
			}
			numPast = 0
		}
135
136
137
	}

	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
Jesse Gross's avatar
Jesse Gross committed
138
		"used", numPast, "remaining", int32(len(prompt))-numPast)
139
140
141
142

	prompt = prompt[numPast:]
	slot.Inputs = slot.Inputs[:numPast]

143
	return slot, prompt, nil
144
145
}

Jesse Gross's avatar
Jesse Gross committed
146
147
// findLongestCacheSlot returns the slot that is not in use and shares the
// longest common prefix with prompt, together with the length of that prefix.
// Ties go to the slot with the lowest index. An error is returned when every
// slot is in use.
func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
	var best *InputCacheSlot
	bestLen := int32(-1)

	for i := range c.slots {
		candidate := &c.slots[i]
		if candidate.InUse {
			continue
		}

		if n := countCommonPrefix(candidate.Inputs, prompt); n > bestLen {
			best, bestLen = candidate, n
		}
	}

	if best == nil {
		return nil, 0, errors.New("no available cache slots")
	}

	return best, bestLen, nil
}

Jesse Gross's avatar
Jesse Gross committed
169
func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
170
171
172
	oldest := time.Now()
	var oldestSlot *InputCacheSlot

Jesse Gross's avatar
Jesse Gross committed
173
	longest := int32(-1)
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
	var longestSlot *InputCacheSlot

	for i, s := range c.slots {
		count := countCommonPrefix(s.Inputs, prompt)
		if count > longest {
			longest = count
			longestSlot = &c.slots[i]
		}

		if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
			oldest = s.lastUsed
			oldestSlot = &c.slots[i]
		}
	}

Jesse Gross's avatar
Jesse Gross committed
189
	if longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
		return longestSlot, longest, nil
	}

	if oldestSlot.InUse {
		return nil, 0, errors.New("no available cache slots")
	}

	if len(oldestSlot.Inputs) != 0 {
		slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
			"used", oldestSlot.lastUsed)
	}

	if longest > 0 && longestSlot != oldestSlot {
		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
			len(longestSlot.Inputs))
		oldestSlot.Inputs = make([]input, longest)
		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
Jesse Gross's avatar
Jesse Gross committed
207
208
		if c.cache != nil {
			c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
209
210
211
212
213
214
		}
	}

	return oldestSlot, longest, nil
}

Jesse Gross's avatar
Jesse Gross committed
215
216
// countCommonPrefix returns how many leading elements of a and b are equal,
// comparing element by element with reflect.DeepEqual and stopping at the
// first mismatch or at the end of the shorter slice.
func countCommonPrefix(a []input, b []input) int32 {
	limit := int32(min(len(a), len(b)))

	var matched int32
	for matched < limit && reflect.DeepEqual(a[matched], b[matched]) {
		matched++
	}

	return matched
}

Jesse Gross's avatar
Jesse Gross committed
233
// ShiftDiscard returns how many inputs have to be dropped from a sequence of
// inputLen inputs so that half of the context beyond the first numKeep inputs
// (but at least one entry) is free. The result is never negative; it is 0
// when enough space is already free.
func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
	wantFree := max((c.numCtx-numKeep)/2, 1)
	haveFree := c.numCtx - inputLen

	return max(wantFree-haveFree, 0)
}

247
248
249
250
// Frees up space in the KV cache by deleting the oldest half of history and shifting
// the newest half into that space (saving numKeep inputs at the beginning).
//
// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
Jesse Gross's avatar
Jesse Gross committed
251
func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
252
253
254
255
	if numKeep >= c.numCtx {
		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
	}

Jesse Gross's avatar
Jesse Gross committed
256
257
	inputLen := int32(len(slot.Inputs))
	discard := c.ShiftDiscard(inputLen, numKeep)
258
259

	if discard <= 0 {
260
		return nil
261
262
	}

263
	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
264
265
		"keep", numKeep, "discard", discard)

266
	// TODO (jessegross): KV cache removal can fail for certain types of models
Jesse Gross's avatar
Jesse Gross committed
267
268
269
270
271
	if c.cache != nil {
		err := c.cache.Remove(slot.Id, numKeep, numKeep+discard)
		if err != nil {
			return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
		}
272
	}
273

Jesse Gross's avatar
Jesse Gross committed
274
	for i := numKeep + discard; i < inputLen; i++ {
275
		slot.Inputs[i-discard] = slot.Inputs[i]
276
	}
Jesse Gross's avatar
Jesse Gross committed
277
	slot.Inputs = slot.Inputs[:inputLen-discard]
278
279

	return nil
280
}