model_vision.go 8.14 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
3
4
5
6
package mllama

import (
	"math"
	"slices"

7
	"github.com/ollama/ollama/fs"
Michael Yang's avatar
Michael Yang committed
8
9
10
11
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

12
// batchSize is the size of the trailing batch dimension used whenever the
// vision model reshapes a tensor. It is fixed at 1.
var batchSize = 1
Michael Yang's avatar
Michael Yang committed
13
14
15
16
17

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
18
	Output *nn.Linear `gguf:"attn_output"`
Michael Yang's avatar
Michael Yang committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)

	key := sa.Key.Forward(ctx, hiddenState)
	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)

	value := sa.Value.Forward(ctx, hiddenState)
	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)

Michael Yang's avatar
Michael Yang committed
33
	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
Michael Yang's avatar
Michael Yang committed
34
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
Michael Yang's avatar
Michael Yang committed
35
	return sa.Output.Forward(ctx, attention)
Michael Yang's avatar
Michael Yang committed
36
37
38
39
}

type VisionMLP struct {
	Up   *nn.Linear `gguf:"ffn_up"`
40
	Down *nn.Linear `gguf:"ffn_down"`
Michael Yang's avatar
Michael Yang committed
41
42
43
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
44
45
	hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.Down.Forward(ctx, hiddenState)
Michael Yang's avatar
Michael Yang committed
46
47
48
49
50

	return hiddenState
}

type VisionEncoderLayer struct {
51
	AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
Michael Yang's avatar
Michael Yang committed
52
	SelfAttention *VisionSelfAttention
53
	AttentionGate ml.Tensor `gguf:"attn_gate"`
Michael Yang's avatar
Michael Yang committed
54

55
	MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
Michael Yang's avatar
Michael Yang committed
56
	MLP     *VisionMLP
57
	MLPGate ml.Tensor `gguf:"ffn_gate"`
Michael Yang's avatar
Michael Yang committed
58
59
60
61
62
63
64
65
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenState

	// self attention
	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
66
67
68
	if e.AttentionGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
	}
Michael Yang's avatar
Michael Yang committed
69
70
71
72
73
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
74
75
76
	if e.MLPGate != nil {
		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
	}
Michael Yang's avatar
Michael Yang committed
77
	hiddenState = hiddenState.Add(ctx, residual)
78
	return hiddenState
Michael Yang's avatar
Michael Yang committed
79
80
81
82
83
84
}

// VisionEncoder is an ordered stack of encoder layers that are applied one
// after another.
type VisionEncoder struct {
	// Layers are applied in slice order.
	Layers []VisionEncoderLayer
}

Michael Yang's avatar
Michael Yang committed
85
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []int32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
Michael Yang's avatar
Michael Yang committed
86
87
	var intermediateHiddenStates []ml.Tensor
	for i, layer := range e.Layers {
Michael Yang's avatar
Michael Yang committed
88
		if slices.Contains(intermediateLayersIndices, int32(i)) {
89
			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int{1}, hiddenState.Shape()...)...))
Michael Yang's avatar
Michael Yang committed
90
91
92
93
94
95
96
97
98
99
100
101
102
		}

		hiddenState = layer.Forward(ctx, hiddenState, opts)
	}

	return hiddenState, intermediateHiddenStates
}

// PrecomputedAspectRatioEmbedding is an embedding table looked up by aspect
// ratio ID, with an optional gate tensor that scales the looked-up embedding.
type PrecomputedAspectRatioEmbedding struct {
	// Embedding is the lookup table indexed by aspect ratio ID.
	Embedding *nn.Embedding
	// Gate, when non-nil, is multiplied into the embedding before it is added
	// to the hidden state.
	Gate      ml.Tensor `gguf:"gate"`
}

103
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
104
	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
105
	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
Michael Yang's avatar
Michael Yang committed
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
	if e.Gate != nil {
		embeddings = embeddings.Mul(ctx, e.Gate)
	}

	return hiddenState.Add(ctx, embeddings)
}

// PrecomputedPositionEmbedding holds two embedding tables, one looked up by
// position ID and one looked up by aspect ratio ID, each with an optional gate
// tensor that scales the looked-up embedding.
type PrecomputedPositionEmbedding struct {
	// PositionEmbedding is indexed by position ID; its gate is applied when
	// non-nil.
	PositionEmbedding     *nn.Embedding `gguf:"position_embd"`
	PositionEmbeddingGate ml.Tensor     `gguf:"position_embd.gate"`

	// TilePositionEmbedding is indexed by aspect ratio ID; its gate is applied
	// when non-nil.
	TilePositionEmbedding     *nn.Embedding `gguf:"tile_position_embd"`
	TilePositionEmbeddingGate ml.Tensor     `gguf:"tile_position_embd.gate"`
}

121
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
122
123
124
125
126
127
128
129
	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
	if e.PositionEmbeddingGate != nil {
		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
	}

	hiddenState = hiddenState.Add(ctx, positionEmbedding)

	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
130
	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
Michael Yang's avatar
Michael Yang committed
131
132
133
134
135
136
137
138
	if e.TilePositionEmbeddingGate != nil {
		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
	}

	return hiddenState.Add(ctx, tilePositionEmbedding)
}

// VisionModelOptions carries the hyperparameters shared by the vision model's
// components.
type VisionModelOptions struct {
	// hiddenSize is the embedding width; it is divided by numHeads to obtain
	// the per-head width used by self-attention.
	hiddenSize int
	numHeads   int

	// imageSize and patchSize determine the patch grid: imageSize/patchSize
	// patches along each side.
	imageSize int
	patchSize int

	// eps is the epsilon passed to every layer norm.
	eps float32

	// intermediateLayersIndices lists the encoder layers whose input hidden
	// state is captured and returned alongside the final output.
	intermediateLayersIndices []int32
}

// VisionModel is the vision tower: a patch embedding, tile and position
// embeddings, two layer norms, an optional class embedding, and two encoder
// stacks. Forward runs Transformer first and GlobalTransformer second.
type VisionModel struct {
	// PatchEmbeddings is the convolution that turns pixel values into patch
	// embeddings.
	PatchEmbeddings *nn.Conv2D `gguf:"patch_embd"`

	// Tile embeddings applied before and after the first encoder stack, plus
	// the combined position embedding applied before it.
	PreTilePositionEmbedding  *PrecomputedAspectRatioEmbedding `gguf:"pre_tile_position_embd"`
	PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"post_tile_position_embd"`
	PositionEmbedding         *PrecomputedPositionEmbedding

	// Layer norms applied before and after the first encoder stack.
	// ClassEmbedding, when non-nil, adds one extra position per tile.
	PreLayerNorm   *nn.LayerNorm `gguf:"pre_ln"`
	PostLayerNorm  *nn.LayerNorm `gguf:"post_ln"`
	ClassEmbedding ml.Tensor     `gguf:"class_embd"`

	// Transformer also yields the intermediate hidden states; the output of
	// GlobalTransformer is concatenated with them.
	Transformer       *VisionEncoder `gguf:"blk"`
	GlobalTransformer *VisionEncoder `gguf:"global.blk"`

	// Embedded so hyperparameters such as hiddenSize and eps are accessible
	// directly on the model.
	*VisionModelOptions
}

func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
164
	numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)
Michael Yang's avatar
Michael Yang committed
165
166
167
168
169
	numPositions := numPatches
	if m.ClassEmbedding != nil {
		numPositions++
	}

170
171
	numTiles := pixelValues.Dim(3)

Michael Yang's avatar
Michael Yang committed
172
	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
173
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
Michael Yang's avatar
Michael Yang committed
174
175
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

176
177
	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
	hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
Michael Yang's avatar
Michael Yang committed
178

179
	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
Michael Yang's avatar
Michael Yang committed
180
181
182
183
184
185
186
187
188
189
	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
	hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)

	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
	hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

190
191
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
Michael Yang's avatar
Michael Yang committed
192

193
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
Michael Yang's avatar
Michael Yang committed
194
195
196
	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
197
198
	hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
Michael Yang's avatar
Michael Yang committed
199

200
201
	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
	hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
Michael Yang's avatar
Michael Yang committed
202
203
204
	return hiddenState.Concat(ctx, hiddenStates, 0)
}

205
func newVisionModel(c fs.Config) *VisionModel {
Michael Yang's avatar
Michael Yang committed
206
207
208
209
210
	return &VisionModel{
		Transformer:       &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},

		VisionModelOptions: &VisionModelOptions{
211
212
			hiddenSize: int(c.Uint("vision.embedding_length")),
			numHeads:   int(c.Uint("vision.attention.head_count")),
Michael Yang's avatar
Michael Yang committed
213
214
215
216
217
218

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

			eps: c.Float("vision.attention.layer_norm_epsilon"),

Michael Yang's avatar
Michael Yang committed
219
			intermediateLayersIndices: c.Ints("vision.intermediate_layers_indices"),
Michael Yang's avatar
Michael Yang committed
220
221
222
		},
	}
}