package qwen25vl

import (
	"bytes"
	"image"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

type Model struct {
	model.Base
	model.BytePairEncoding

	*TextModel
20
	*VisionModel `gguf:"v"`
21
22
23
24
25
26
27
28
29
30
31
32
33
34

	ImageProcessor
}

// Implement MultimodalProcessor interface
var _ model.MultimodalProcessor = (*Model)(nil)

func New(c fs.Config) (model.Model, error) {
	m := &Model{
		BytePairEncoding: model.NewBytePairEncoding(
			&model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Ints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
Michael Yang's avatar
Michael Yang committed
35
				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
36
				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
37
				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
38
39
40
41
				EOS: append(
					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
					c.Ints("tokenizer.ggml.eos_token_ids")...,
				),
42
			},
43
			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
44
45
46
47
48
49
50
51
52
53
54
55
		),
		TextModel:      NewTextModel(c),
		VisionModel:    newVisionModel(c),
		ImageProcessor: newImageProcessor(c),
	}

	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)

	return m, nil
}

func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
Michael Yang's avatar
Michael Yang committed
56
	img, _, err := image.Decode(bytes.NewReader(multimodalData))
57
58
59
60
	if err != nil {
		return nil, nil, err
	}

Michael Yang's avatar
Michael Yang committed
61
	f32s, grid, err := m.ImageProcessor.ProcessImage(img)
62
63
64
65
66
	if err != nil {
		return nil, nil, err
	}

	// Calculate tensor dimensions
Michael Yang's avatar
Michael Yang committed
67
	patchDim := m.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
68
69
	numPatches := grid.Temporal * grid.Height * grid.Width

Michael Yang's avatar
Michael Yang committed
70
	pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
71
72
73
74

	return pixelValues, grid, nil
}

75
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
76
77
78
79
80
81
82
83
84
85
	if len(m.VisionModel.Layers) == 0 {
		return nil, model.ErrNoVisionModel
	}

	pixels, grid, err := m.PixelValues(ctx, multimodalData)
	if err != nil {
		return nil, err
	}

	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
Michael Yang's avatar
Michael Yang committed
86
	return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
87
88
89
}

// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
90
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
Michael Yang's avatar
Michael Yang committed
91
92
	// Reset position cache
	m.positionCache = m.positionCache[:0]
93
	var result []*input.Input
94
95
96
97
98
99
100

	var (
		imageToken       int32 = 151655
		visionStartToken int32 = 151652
		visionEndToken   int32 = 151653
	)

Michael Yang's avatar
Michael Yang committed
101
102
103
104
105
106
107
	appendInput := func(i *input.Input, p int) int {
		result = append(result, i)
		m.positionCache = append(m.positionCache, int32(p))
		return p + 1
	}

	var p int
108
109
110
	for _, inp := range inputs {
		if inp.Multimodal == nil {
			// If not a multimodal input, add it to the result unchanged
Michael Yang's avatar
Michael Yang committed
111
			p = appendInput(inp, p)
112
113
		} else {
			// First add the vision start token
Michael Yang's avatar
Michael Yang committed
114
			p = appendInput(&input.Input{Token: visionStartToken}, p)
115
116

			// Add the image token with the multimodal tensor data at the first position
Michael Yang's avatar
Michael Yang committed
117
118
			tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
			appendInput(&input.Input{
119
				Token:          imageToken,
120
				Multimodal:     inp.Multimodal,
121
				MultimodalHash: inp.MultimodalHash,
Michael Yang's avatar
Michael Yang committed
122
123
				SameBatch:      tokensPerGrid,
			}, p)
124
125

			// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
Michael Yang's avatar
Michael Yang committed
126
127
128
			for range tokensPerGrid - 1 {
				appendInput(&input.Input{Token: imageToken}, p)
			}
129

Michael Yang's avatar
Michael Yang committed
130
131
			grid := inp.Multimodal[0].Data.(*Grid)
			p = appendInput(&input.Input{Token: visionEndToken}, p+max(grid.Width/m.spatialMergeSize, grid.Height/m.spatialMergeSize))
132
133
134
135
136
137
138
		}
	}

	return result, nil
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
Michael Yang's avatar
Michael Yang committed
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
	// Initial token embedding
	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)

	positionSlice := func() [][]int32 {
		s := [][]int32{
			make([]int32, len(batch.Positions)),
			make([]int32, len(batch.Positions)),
			make([]int32, len(batch.Positions)),
			make([]int32, len(batch.Positions)),
		}
		for i, position := range batch.Positions {
			if position < int32(len(m.positionCache)) {
				position = m.positionCache[position]
			} else if len(m.positionCache) > 0 {
				position = position - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
			}

			s[0][i] = position
			s[1][i] = position
			s[2][i] = position
		}
		return s
	}()

	for _, mi := range batch.Multimodal {
		img := mi.Multimodal[0].Tensor
		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
		if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
			for i := range img.Dim(1) {
				w := grid.Width / m.spatialMergeSize
				positionSlice[1][mi.Index+i] += int32(i / w)
				positionSlice[2][mi.Index+i] += int32(i % w)
			}
		}
	}

	positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))

	// Process through transformer layers
	for i, layer := range m.TextModel.Layers {
		m.Cache.SetLayer(i)

		var lastLayerOutputs ml.Tensor
		if i == len(m.TextModel.Layers)-1 {
			lastLayerOutputs = batch.Outputs
		}

		hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextOptions)
	}
188

Michael Yang's avatar
Michael Yang committed
189
190
	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
	return m.Output.Forward(ctx, hiddenStates), nil
191
192
193
194
195
}

// init registers the qwen25vl architecture with the model loader.
func init() {
	model.Register("qwen25vl", New)
}