model_vision.go 8.6 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
3
4
5
6
package mllama

import (
	"math"
	"slices"

7
	"github.com/ollama/ollama/fs"
Michael Yang's avatar
Michael Yang committed
8
9
10
11
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

12
// batchSize is the batch dimension passed to the Reshape calls in this file.
// It is initialised to 1.
var batchSize = 1
Michael Yang's avatar
Michael Yang committed
13
14
15
16
17

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
18
	Output *nn.Linear `gguf:"attn_output"`
Michael Yang's avatar
Michael Yang committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

	Gate ml.Tensor `gguf:"attn_gate"`
}

// Forward computes scaled dot-product self-attention over hiddenState and
// returns the result of the Output projection. The per-head width is
// opts.hiddenSize / opts.numHeads, and scores are scaled by 1/sqrt(headDim)
// before the softmax. No attention mask is applied.
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads
	scale := 1.0 / math.Sqrt(float64(headDim))

	// Split each projection into heads, then reorder the axes for Mulmat.
	q := sa.Query.Forward(ctx, hiddenState)
	q = q.Reshape(ctx, headDim, opts.numHeads, q.Dim(1), batchSize).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

	k := sa.Key.Forward(ctx, hiddenState)
	k = k.Reshape(ctx, headDim, opts.numHeads, k.Dim(1), batchSize).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

	// Value uses a different axis order from query and key because it is the
	// left operand of the second Mulmat.
	v := sa.Value.Forward(ctx, hiddenState)
	v = v.Reshape(ctx, headDim, opts.numHeads, v.Dim(1), batchSize).Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	weights := k.Mulmat(ctx, q).Scale(ctx, scale).Softmax(ctx)

	// Merge the heads back into a single hiddenSize-wide axis.
	mixed := v.Mulmat(ctx, weights)
	mixed = mixed.Reshape(ctx, headDim, mixed.Dim(1), opts.numHeads, batchSize).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	mixed = mixed.Reshape(ctx, opts.hiddenSize, mixed.Dim(2), batchSize)

	return sa.Output.Forward(ctx, mixed)
}

type VisionMLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
53
	Down *nn.Linear `gguf:"ffn_down"`
Michael Yang's avatar
Michael Yang committed
54
55
56
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
57
58
	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.Down.Forward(ctx, hiddenState)
Michael Yang's avatar
Michael Yang committed
59
60
61
62
63

	return hiddenState
}

type VisionEncoderLayer struct {
64
	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
Michael Yang's avatar
Michael Yang committed
65
	SelfAttention *VisionSelfAttention
66
	AttentionGate ml.Tensor `gguf:"attn_gate"`
Michael Yang's avatar
Michael Yang committed
67

68
	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
Michael Yang's avatar
Michael Yang committed
69
	MLP     *VisionMLP
70
	MLPGate ml.Tensor `gguf:"ffn_gate"`
Michael Yang's avatar
Michael Yang committed
71
72
73
74
75
76
77
78
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenState

	// self attention
	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
79
80
81
82

	if e.AttentionGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
	}
Michael Yang's avatar
Michael Yang committed
83
84
85
86
87
88
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// feed forward
	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
89
90
91
92
93
94
	hiddenState = hiddenState.Add(ctx, residual)
	if e.MLPGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
	}

	return hiddenState
Michael Yang's avatar
Michael Yang committed
95
96
97
98
99
100
}

// VisionEncoder is an ordered stack of encoder layers; Forward applies them
// from first to last.
type VisionEncoder struct {
	Layers []VisionEncoderLayer
}

Michael Yang's avatar
Michael Yang committed
101
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []int32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
Michael Yang's avatar
Michael Yang committed
102
103
	var intermediateHiddenStates []ml.Tensor
	for i, layer := range e.Layers {
Michael Yang's avatar
Michael Yang committed
104
		if slices.Contains(intermediateLayersIndices, int32(i)) {
105
			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
Michael Yang's avatar
Michael Yang committed
106
107
108
109
110
111
112
113
114
115
116
117
118
		}

		hiddenState = layer.Forward(ctx, hiddenState, opts)
	}

	return hiddenState, intermediateHiddenStates
}

// PrecomputedAspectRatioEmbedding is an embedding table looked up by aspect
// ratio ID, with an optional Gate that Forward multiplies into the looked-up
// embeddings when it is non-nil.
type PrecomputedAspectRatioEmbedding struct {
	Embedding *nn.Embedding
	Gate      ml.Tensor `gguf:"gate"`
}

119
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
120
	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
121
	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
Michael Yang's avatar
Michael Yang committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
	if e.Gate != nil {
		embeddings = embeddings.Mul(ctx, e.Gate)
	}

	return hiddenState.Add(ctx, embeddings)
}

// PrecomputedPositionEmbedding holds two embedding tables, one looked up by
// position ID and one looked up by aspect ratio ID (the tile position
// embedding), each with an optional gate that Forward applies when non-nil.
type PrecomputedPositionEmbedding struct {
	PositionEmbedding     *nn.Embedding `gguf:"position_embd"`
	PositionEmbeddingGate ml.Tensor     `gguf:"position_embd.gate"`

	TilePositionEmbedding     *nn.Embedding `gguf:"tile_position_embd"`
	TilePositionEmbeddingGate ml.Tensor     `gguf:"tile_position_embd.gate"`
}

137
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
138
139
140
141
142
143
144
145
	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
	if e.PositionEmbeddingGate != nil {
		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
	}

	hiddenState = hiddenState.Add(ctx, positionEmbedding)

	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
146
	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
Michael Yang's avatar
Michael Yang committed
147
148
149
150
151
152
153
154
	if e.TilePositionEmbeddingGate != nil {
		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
	}

	return hiddenState.Add(ctx, tilePositionEmbedding)
}

// VisionModelOptions carries the hyperparameters shared by the vision model's
// sub-modules.
//
// hiddenSize and numHeads determine the per-head width used by
// self-attention. imageSize and patchSize determine the number of patches per
// tile. eps is passed to every layer norm. intermediateLayersIndices selects
// the encoder layers whose inputs are collected as intermediate hidden states.
type VisionModelOptions struct {
	hiddenSize, numHeads int
	imageSize, patchSize int
	eps                  float32

	intermediateLayersIndices []int32
}

// VisionModel is the complete vision encoder: patch embedding, pre- and
// post-tile position embeddings, position embedding, pre- and post- layer
// norms, an optional class embedding, and two encoder stacks (Transformer and
// GlobalTransformer). The embedded *VisionModelOptions supplies the
// hyperparameters that Forward reads directly (m.hiddenSize, m.eps, ...).
type VisionModel struct {
	PatchEmbeddings *nn.Conv2D `gguf:"patch_embd"`

	PreTilePositionEmbedding  *PrecomputedAspectRatioEmbedding `gguf:"pre_tile_position_embd"`
	PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"post_tile_position_embd"`
	PositionEmbedding         *PrecomputedPositionEmbedding

	PreLayerNorm   *nn.LayerNorm `gguf:"pre_ln"`
	PostLayerNorm  *nn.LayerNorm `gguf:"post_ln"`
	ClassEmbedding ml.Tensor     `gguf:"class_embd"`

	Transformer       *VisionEncoder `gguf:"blk"`
	GlobalTransformer *VisionEncoder `gguf:"global.blk"`

	*VisionModelOptions
}

func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
180
	numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)
Michael Yang's avatar
Michael Yang committed
181
182
183
184
185
	numPositions := numPatches
	if m.ClassEmbedding != nil {
		numPositions++
	}

186
187
	numTiles := pixelValues.Dim(3)

Michael Yang's avatar
Michael Yang committed
188
	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
189
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
Michael Yang's avatar
Michael Yang committed
190
191
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

192
193
	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
Michael Yang's avatar
Michael Yang committed
194

195
	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
Michael Yang's avatar
Michael Yang committed
196
197
198
199
200
201
202
203
204
205
	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
	hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)

	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
	hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

206
207
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
Michael Yang's avatar
Michael Yang committed
208

209
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
Michael Yang's avatar
Michael Yang committed
210
211
212
	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
213
214
	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
Michael Yang's avatar
Michael Yang committed
215

216
217
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
Michael Yang's avatar
Michael Yang committed
218
219
220
	return hiddenState.Concat(ctx, hiddenStates, 0)
}

221
func newVisionModel(c fs.Config) *VisionModel {
Michael Yang's avatar
Michael Yang committed
222
223
224
225
226
	return &VisionModel{
		Transformer:       &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},

		VisionModelOptions: &VisionModelOptions{
227
228
			hiddenSize: int(c.Uint("vision.embedding_length")),
			numHeads:   int(c.Uint("vision.attention.head_count")),
Michael Yang's avatar
Michael Yang committed
229
230
231
232
233
234

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

			eps: c.Float("vision.attention.layer_norm_epsilon"),

Michael Yang's avatar
Michael Yang committed
235
			intermediateLayersIndices: c.Ints("vision.intermediate_layers_indices"),
Michael Yang's avatar
Michael Yang committed
236
237
238
		},
	}
}