package qwen25vl

import (
	"math"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)

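// blockDiagonalMask builds a (seqLength x seqLength) additive attention mask
// that is 0 inside each block delimited by consecutive entries of bounds and
// -Inf everywhere else, so softmax confines attention to a block. For
// example, bounds = [0, 2, 4] with seqLength = 4 yields two 2x2 zero blocks
// on the diagonal and -Inf elsewhere.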
func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int) ml.Tensor {
	// Initialize a 2D mask with -Inf
	s := make([][]float32, seqLength)
	for i := range s {
		s[i] = slices.Repeat([]float32{float32(math.Inf(-1))}, seqLength)
	}

	// Fill in the mask with zeros for tokens that CAN attend to each other
	for i := 1; i < len(bounds); i++ {
		start, end := bounds[i-1], bounds[i]
		// Enable attention within this sequence block
		for row := start; row < end; row++ {
			for col := start; col < end; col++ {
				s[row][col] = 0.0
			}
		}
	}

	return ctx.Input().FromFloats(slices.Concat(s...), seqLength, seqLength)
}

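// VisionSelfAttention implements multi-head self-attention for the vision
// encoder; the gguf tags name the tensors the projections are loaded from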
type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

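// Forward projects hiddenStates to queries, keys, and values, splits them
// into numHeads heads of headDim, applies rotary position embeddings to the
// queries and keys, and runs scaled dot-product attention. A nil mask means
// full attention; otherwise mask is added to the attention logits before the
// softmax, as with the block-diagonal window mask above.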
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	query := sa.Query.Forward(ctx, hiddenStates)
	key := sa.Key.Forward(ctx, hiddenStates)
	value := sa.Value.Forward(ctx, hiddenStates)

	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1))
	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1))
	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1))

	query = opts.applyRotaryPositionEmbeddings(ctx, query, positions)
	key = opts.applyRotaryPositionEmbeddings(ctx, key, positions)

	// Scale factor for scaled dot-product attention
	scale := 1.0 / math.Sqrt(float64(opts.headDim))

	// Scaled dot-product attention
	query = query.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)
	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	kq := key.MulmatFullPrec(ctx, query)
	kq = kq.Scale(ctx, scale)
	if mask != nil {
		kq = kq.Add(ctx, mask)
	}
	kq = kq.Softmax(ctx)
	kqv := value.Mulmat(ctx, kq)
	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2))

	return sa.Output.Forward(ctx, attention)
}

// VisionMLP implements the gated feed-forward (SwiGLU-style) block of an encoder layer
type VisionMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

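// Forward applies the gated activation down(SILU(gate(x)) * up(x)); the
// fused SILU call multiplies by the up projection element-wise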
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
	return mlp.Down.Forward(ctx, hiddenStates)
}

type VisionEncoderLayer struct {
	Norm1         *nn.RMSNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	Norm2         *nn.RMSNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

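// Forward runs one pre-norm transformer block: RMSNorm, self-attention, and
// a residual add, followed by RMSNorm, the MLP, and a second residual add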
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, positions, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, positions, mask, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}

// VisionModelOptions contains configuration options for the vision encoder
type VisionModelOptions struct {
	hiddenSize        int
	numHeads          int
	headDim           int
	patchSize         int
	numChannels       int
	eps               float32
	ropeTheta         float32
	spatialMergeSize  int
	windowSize        int
	fullAttnBlocks    []int32
	temporalPatchSize int
}

func (o VisionModelOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	return nn.RoPE(ctx, states, positions, o.headDim/2, o.ropeTheta, 1,
		rope.WithVision([]int{
			o.headDim / 4,
			o.headDim / 4,
			o.headDim / 4,
			o.headDim / 4,
		}),
	)
}

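// PatchEmbedding maps raw pixel patches to the encoder hidden size. The
// temporal axis (temporalPatchSize frames) is handled by two separate 2D
// convolutions whose outputs are summed; for static images, where the single
// frame is duplicated, this effectively matches a 3D convolution with a
// temporal kernel of 2.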
type PatchEmbedding struct {
	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
}

func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	numPatches := pixelValues.Shape()[1]

	// Reshape the input tensor to (patch pixels, temporal, channels, patches)
	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)

	// Permute the tensor to bring the temporal dimension to the front
	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	// Split the tensor into parts for the temporal convolutions
	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
	in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
	in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)

	s0, s1 := opts.patchSize, opts.patchSize // Use full stride
	p0, p1 := 0, 0                           // padding
	d0, d1 := 1, 1                           // dilation
	out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
	out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)

	// Add the outputs from the two temporal convolutions
	out := out0.Add(ctx, out1)

	// Flatten the summed convolution output to (hiddenSize, numPatches)
	return out.Reshape(ctx, opts.hiddenSize, numPatches)
}

// VisionPatchMerger implements patch merging for the Qwen vision model
type VisionPatchMerger struct {
	LNQ  *nn.RMSNorm `gguf:"ln_q"`
	MLP0 *nn.Linear  `gguf:"mlp.0"`
	MLP2 *nn.Linear  `gguf:"mlp.2"`
}

// Forward computes patch merging for the vision model
func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)

	hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)

	// Reshape so each spatialMergeSize x spatialMergeSize group of patches is packed into a single row
	reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize))
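	// Project the merged patches to the output embedding size with a
	// two-layer MLP and a GELU in between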
	hidden := pm.MLP0.Forward(ctx, reshaped)
	activated := hidden.GELU(ctx)

	output := pm.MLP2.Forward(ctx, activated)

	return output
}

// VisionModel implements the Qwen vision model
type VisionModel struct {
	PatchEmbedding *PatchEmbedding
	Layers         []VisionEncoderLayer `gguf:"blk"`
	PatchMerger    *VisionPatchMerger   `gguf:"merger"`

	*VisionModelOptions
}

// Forward computes vision embeddings for the given pixel values and image grid
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
	// Extract patch embeddings
	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)

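	// Compute the window-ordered permutation of merged patches and the
	// token-level bounds of each window for windowed attention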
	index, bounds := m.windowIndex(grid)
	spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize

	windowIndex := ctx.Input().FromInts(index, len(index))
	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
	hiddenStates = hiddenStates.Rows(ctx, windowIndex.Argsort(ctx))
	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)

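	// Build four concatenated position sections (row, column, row, column)
	// for the 2D rotary embeddings, laid out in the same window order as the
	// permuted hidden states; the 2x2 inner loops assume spatialMergeSize == 2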
	positions := ctx.Input().FromInts(func() []int32 {
		s := [][]int32{
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
		}

		var cur int
		for y := 0; y < grid.Height; y += m.spatialMergeSize {
			for x := 0; x < grid.Width; x += m.spatialMergeSize {
				for dy := range 2 {
					for dx := range 2 {
						i := int(index[cur/spatialMergeUnit]) * spatialMergeUnit
						i += cur % spatialMergeUnit
						s[0][i] = int32(y + dy)
						s[1][i] = int32(x + dx)
						s[2][i] = int32(y + dy)
						s[3][i] = int32(x + dx)
						cur++
					}
				}
			}
		}

		return slices.Concat(s...)
	}(), grid.Height*grid.Width*4)

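	// Outside the designated full-attention blocks, patches may only attend
	// to other patches in the same window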
	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds)

	// Apply encoder layers
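	// Blocks listed in fullAttnBlocks attend globally (nil mask); all other
	// blocks use the block-diagonal window mask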
	for i, layer := range m.Layers {
		if slices.Contains(m.fullAttnBlocks, int32(i)) {
			hiddenStates = layer.Forward(ctx, hiddenStates, positions, nil, m.VisionModelOptions)
		} else {
			hiddenStates = layer.Forward(
				ctx,
				hiddenStates,
				positions,
				mask,
				m.VisionModelOptions,
			)
		}
	}

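	// Merge each spatialMergeSize x spatialMergeSize group of patches, then
	// gather the rows back into their original row-major order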
	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
	return hiddenStates.Rows(ctx, windowIndex)
}

// windowIndex divides the grid into windows and returns:
//  1. A slice of grid point indices organized by windows
//  2. A slice of boundaries that mark where each window's data begins and ends
//     in the flattened representation, scaled by spatialMergeSize squared
//
// The boundaries slice always starts with 0 and contains cumulative ending
// positions for each window, allowing downstream processing to identify
// window boundaries in the tensor data.
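// For example, with the defaults (windowSize 112, patchSize 14,
// spatialMergeSize 2), each window spans 112/14/2 = 4 merged positions per
// side, i.e. an 8x8 region of patches.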
func (m *VisionModel) windowIndex(grid *Grid) (index []int32, bounds []int) {
	height := grid.Height / m.spatialMergeSize
	width := grid.Width / m.spatialMergeSize
	window := m.windowSize / m.patchSize / m.spatialMergeSize

	index = make([]int32, height*width)

	bounds = make([]int, 0, ((height+window-1)/window)*((width+window-1)/window)+1)
	bounds = append(bounds, 0)

	var cur int32
	for y := 0; y < height; y += window {
		for x := 0; x < width; x += window {
			h1 := min(window, height-y)
			w1 := min(window, width-x)
			for dy := range h1 {
				for dx := range w1 {
					win := (y+dy)*width + (x + dx)
					index[win] = cur
					cur++
				}
			}
			// scale merged positions back to token positions (each merged
			// position covers spatialMergeSize squared patch tokens)
			bounds = append(bounds, int(cur)*m.spatialMergeSize*m.spatialMergeSize)
		}
	}
	return index, bounds
}

// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
	patchSize := int(c.Uint("vision.patch_size", 14))
	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
	numHeads := int(c.Uint("vision.attention.head_count", 16))
	numChannels := int(c.Uint("vision.num_channels", 3))
	eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
	ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
	windowSize := int(c.Uint("vision.window_size", 112))
	fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))

	model := &VisionModel{
		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
		VisionModelOptions: &VisionModelOptions{
			hiddenSize:        hiddenSize,
			numHeads:          numHeads,
			headDim:           hiddenSize / numHeads,
			patchSize:         patchSize,
			numChannels:       numChannels,
			eps:               eps,
			ropeTheta:         ropeTheta,
			spatialMergeSize:  spatialMergeSize,
			windowSize:        windowSize,
			temporalPatchSize: temporalPatchSize,
			fullAttnBlocks:    fullAttnBlocks,
		},
	}

	return model
}