model_text.go 8.91 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
3
4
5
6
package mllama

import (
	"math"
	"slices"

7
	"github.com/ollama/ollama/fs"
Jesse Gross's avatar
Jesse Gross committed
8
	"github.com/ollama/ollama/kvcache"
Michael Yang's avatar
Michael Yang committed
9
10
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
11
12
	"github.com/ollama/ollama/ml/nn/fast"
	"github.com/ollama/ollama/ml/nn/rope"
Michael Yang's avatar
Michael Yang committed
13
14
15
)

type TextSelfAttention struct {
Jesse Gross's avatar
Jesse Gross committed
16
17
18
19
20
	Query       *nn.Linear `gguf:"attn_q"`
	Key         *nn.Linear `gguf:"attn_k"`
	Value       *nn.Linear `gguf:"attn_v"`
	Output      *nn.Linear `gguf:"attn_output"`
	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
Michael Yang's avatar
Michael Yang committed
21
22
}

23
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
24
25
26
27
28
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
29
	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
Michael Yang's avatar
Michael Yang committed
30
31
32

	key := sa.Key.Forward(ctx, hiddenState)
	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
33
	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
Michael Yang's avatar
Michael Yang committed
34
35
36
37

	value := sa.Value.Forward(ctx, hiddenState)
	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

38
	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
39
	attention := nn.Attention(ctx, query, key, value, scaleFactor, cache)
Michael Yang's avatar
Michael Yang committed
40
41
42
43
44
	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return sa.Output.Forward(ctx, attention)
}

Jesse Gross's avatar
Jesse Gross committed
45
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
Patrick Devine's avatar
Patrick Devine committed
46
	// This will only get called for layers in the cache, which are just the self attention layers
Jesse Gross's avatar
Jesse Gross committed
47
	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
48
		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
Jesse Gross's avatar
Jesse Gross committed
49
50
51
	}

	return key, nil
Jesse Gross's avatar
Jesse Gross committed
52
53
}

Michael Yang's avatar
Michael Yang committed
54
55
56
57
58
59
60
// TextMLP holds the three linear projections of a decoder layer's
// feed-forward block: Gate and Up feed the activation, Down produces the
// output (see TextMLP.Forward).
type TextMLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
	Gate *nn.Linear `gguf:"ffn_gate"`
}

func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
61
	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx, mlp.Up.Forward(ctx, hiddenState))
Michael Yang's avatar
Michael Yang committed
62
63
64
65
66
67
68
69
70
71
72
	return mlp.Down.Forward(ctx, hiddenState)
}

// TextSelfAttentionDecoderLayer is a decoder layer made of a normalized
// self-attention block followed by a normalized MLP block, each wrapped in a
// residual connection (see its Forward method).
type TextSelfAttentionDecoderLayer struct {
	// AttentionNorm is applied to the input before SelfAttention.
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	SelfAttention *TextSelfAttention

	// MLPNorm is applied to the post-attention state before MLP.
	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
	MLP     *TextMLP
}

73
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
74
75
76
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
77
	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
78
79
80
81
82
83
84
85

	// In the final layer (outputs != nil), optimize by pruning to just the token positions
	// we need logits for.
	if outputs != nil {
		hiddenState = hiddenState.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

Michael Yang's avatar
Michael Yang committed
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}

// TextCrossAttention holds the weights of one cross-attention block. Queries
// come from the text hidden state; keys and values come from the cross-attention
// states passed to Forward. Query and key each have their own RMS norm.
type TextCrossAttention struct {
	QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
	Query     *nn.Linear  `gguf:"cross_attn_q_proj"`
	KeyNorm   *nn.RMSNorm `gguf:"cross_attn_k_norm"`
	Key       *nn.Linear  `gguf:"cross_attn_k_proj"`
	Value     *nn.Linear  `gguf:"cross_attn_v_proj"`
	Output    *nn.Linear  `gguf:"cross_attn_o_proj"`
}

Jesse Gross's avatar
Jesse Gross committed
103
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
104
105
106
107
108
109
110
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads

	query := ca.Query.Forward(ctx, hiddenState)
	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
	query = ca.QueryNorm.Forward(ctx, query, opts.eps)

111
	var key, value ml.Tensor
Jesse Gross's avatar
Jesse Gross committed
112
113
	if crossAttentionStates != nil {
		numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
Michael Yang's avatar
Michael Yang committed
114

Jesse Gross's avatar
Jesse Gross committed
115
116
117
		key = ca.Key.Forward(ctx, crossAttentionStates)
		key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
		key = ca.KeyNorm.Forward(ctx, key, opts.eps)
Michael Yang's avatar
Michael Yang committed
118

Jesse Gross's avatar
Jesse Gross committed
119
120
121
122
123
		value = ca.Value.Forward(ctx, crossAttentionStates)
		value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)

		cache.Put(ctx, key, value)
	}
Michael Yang's avatar
Michael Yang committed
124

125
	key, value, _ = cache.Get(ctx)
Michael Yang's avatar
Michael Yang committed
126

127
	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
128
129
130
131
132
133
134
135
136
137
138
139

	query = query.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)
	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	kq := key.MulmatFullPrec(ctx, query)

	kq = kq.Scale(ctx, scaleFactor)
	kq = kq.Softmax(ctx)

	kqv := value.Mulmat(ctx, kq)
	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
Michael Yang's avatar
Michael Yang committed
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return ca.Output.Forward(ctx, attention)
}

// TextCrossAttentionDecoderLayer is a decoder layer made of a normalized
// cross-attention block followed by a normalized MLP block. The output of each
// block is multiplied by the tanh of a gate tensor before the residual add
// (see its Forward method).
type TextCrossAttentionDecoderLayer struct {
	AttentionNorm  *nn.RMSNorm `gguf:"attn_norm"`
	CrossAttention *TextCrossAttention
	// AttentionGate scales the cross-attention output via tanh.
	AttentionGate  ml.Tensor `gguf:"cross_attn_attn_gate"`

	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
	MLP     *TextMLP
	// MLPGate scales the MLP output via tanh.
	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
}

155
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
	hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
	hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
	return hiddenState.Add(ctx, residual)
}

type TextDecoderLayer interface {
171
	Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
Michael Yang's avatar
Michael Yang committed
172
173
174
175
176
177
}

// TextDecoder is the ordered stack of decoder layers that TextDecoder.Forward
// runs in sequence.
type TextDecoder struct {
	Layers []TextDecoderLayer
}

178
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
179
	for i, layer := range d.Layers {
Jesse Gross's avatar
Jesse Gross committed
180
		layerType := selfAttentionLayer
Michael Yang's avatar
Michael Yang committed
181
		if slices.Contains(opts.crossAttentionLayers, int32(i)) {
Jesse Gross's avatar
Jesse Gross committed
182
183
184
185
186
187
188
			layerType = crossAttentionLayer
		}

		cache.SetLayer(i)
		cache.SetLayerType(layerType)

		if layerType == selfAttentionLayer || crossAttentionStates != nil || cache.UnderlyingCache().(*kvcache.EncoderCache).EncoderCached() {
189
190
191
192
193
			var lastLayerOutputs ml.Tensor
			if i == len(d.Layers)-1 {
				lastLayerOutputs = outputs
			}

194
			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
Michael Yang's avatar
Michael Yang committed
195
196
197
198
199
200
201
		}
	}

	return hiddenState
}

// TextModelOptions holds the text model hyperparameters that newTextModel
// reads from the model configuration.
//
//   - hiddenSize, numHeads, numKVHeads: embedding width and the query and
//     key/value head counts; headDim is computed as hiddenSize / numHeads.
//   - ropeDim, ropeBase, ropeScale: RoPE parameters; callers pass 1/ropeScale
//     to RoPE.
//   - eps: epsilon passed to every RMSNorm.
//   - crossAttentionLayers: indices of the layers that are cross-attention
//     layers; all other layers are self-attention layers.
type TextModelOptions struct {
	hiddenSize, numHeads, numKVHeads int
	ropeDim                          int
	eps, ropeBase, ropeScale         float32

	crossAttentionLayers []int32
}

// TextModel is the text half of the model: token embedding, decoder stack,
// final norm and output projection. TextModelOptions is embedded, so its
// fields (eps, ropeDim, ...) are reachable directly on the model.
type TextModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Transformer    *TextDecoder  `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output"`

	*TextModelOptions
}

218
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
219
	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
220
	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
Michael Yang's avatar
Michael Yang committed
221
222
223
224
	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
	return m.Output.Forward(ctx, hiddenState)
}

225
func newTextModel(c fs.Config) *TextModel {
Michael Yang's avatar
Michael Yang committed
226
227
228
	var decoderLayers []TextDecoderLayer
	for i := range c.Uint("block_count") {
		var textDecoderLayer TextDecoderLayer
Michael Yang's avatar
Michael Yang committed
229
		if slices.Contains(c.Ints("attention.cross_attention_layers"), int32(i)) {
Michael Yang's avatar
Michael Yang committed
230
231
232
233
234
235
236
237
238
239
240
			textDecoderLayer = &TextCrossAttentionDecoderLayer{}
		} else {
			textDecoderLayer = &TextSelfAttentionDecoderLayer{}
		}

		decoderLayers = append(decoderLayers, textDecoderLayer)
	}

	return &TextModel{
		Transformer: &TextDecoder{Layers: decoderLayers},
		TextModelOptions: &TextModelOptions{
241
242
243
			hiddenSize:           int(c.Uint("embedding_length")),
			numHeads:             int(c.Uint("attention.head_count")),
			numKVHeads:           int(c.Uint("attention.head_count_kv")),
244
			ropeDim:              int(c.Uint("rope.dimension_count")),
Michael Yang's avatar
Michael Yang committed
245
246
			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
			ropeBase:             c.Float("rope.freq_base"),
247
			ropeScale:            c.Float("rope.scaling.factor", 1),
Michael Yang's avatar
Michael Yang committed
248
			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
Michael Yang's avatar
Michael Yang committed
249
250
251
		},
	}
}