package gemma3

import (
	"math"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

// batchSize is the fixed batch dimension used when reshaping vision
// tensors; the vision tower processes one image at a time.
var batchSize = 1

// VisionSelfAttention holds the projection weights for one multi-head
// self-attention sub-block of the vision encoder. The gguf tags name the
// tensors these fields are loaded from.
type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	key := sa.Key.Forward(ctx, hiddenState)
	value := sa.Value.Forward(ctx, hiddenState)

Michael Yang's avatar
Michael Yang committed
27
28
29
	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
Michael Yang's avatar
Michael Yang committed
30

Michael Yang's avatar
Michael Yang committed
31
	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
Michael Yang's avatar
Michael Yang committed
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

	hiddenState = sa.Output.Forward(ctx, attention)
	return hiddenState
}

// VisionMLP holds the two fully-connected layers of an encoder block's
// feed-forward network; the gguf tags name the tensors they load from.
type VisionMLP struct {
	FC1 *nn.Linear `gguf:"fc1"`
	FC2 *nn.Linear `gguf:"fc2"`
}

// Forward runs the feed-forward network: FC1, GELU activation, then FC2.
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	return mlp.FC2.Forward(ctx, mlp.FC1.Forward(ctx, hiddenState).GELU(ctx))
}

// VisionEncoderLayer is one transformer block of the vision encoder:
// pre-norm self-attention followed by a pre-norm feed-forward network,
// each with a residual connection (see Forward).
type VisionEncoderLayer struct {
	LayerNorm1    *nn.LayerNorm `gguf:"layer_norm1"`
	SelfAttention *VisionSelfAttention

	LayerNorm2 *nn.LayerNorm `gguf:"layer_norm2"`
	MLP        *VisionMLP    `gguf:"mlp"`
}

// Forward runs one encoder block: a pre-norm self-attention sub-block and a
// pre-norm feed-forward sub-block, each wrapped in a residual connection.
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	// Self-attention with residual: x + SA(LN1(x)).
	attnOut := e.SelfAttention.Forward(ctx, e.LayerNorm1.Forward(ctx, hiddenState, opts.eps), opts)
	hiddenState = attnOut.Add(ctx, hiddenState)

	// Feed-forward with residual: x + MLP(LN2(x)).
	mlpOut := e.MLP.Forward(ctx, e.LayerNorm2.Forward(ctx, hiddenState, opts.eps), opts)
	return mlpOut.Add(ctx, hiddenState)
}

// VisionModelOptions carries the hyperparameters of the vision tower,
// read from the model config in newVisionModel.
type VisionModelOptions struct {
	hiddenSize, numHeads int     // embedding width and attention head count
	imageSize, patchSize int     // input image side and patch side, in pixels
	eps                  float32 // layer-norm epsilon
}

// VisionModel is the gemma3 vision tower: a patch embedding, learned
// position embeddings, a stack of encoder layers, and a final layer norm.
// Embedded VisionModelOptions supplies the hyperparameters.
type VisionModel struct {
	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embedding"`
	PositionEmbedding *nn.Embedding `gguf:"position_embedding"`
	PostLayerNorm     *nn.LayerNorm `gguf:"post_layernorm"`

	Layers []VisionEncoderLayer `gguf:"blk"`

	*VisionModelOptions
}

Michael Yang's avatar
Michael Yang committed
88
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
89
90
91
92
93
94
	numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)

	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

Michael Yang's avatar
Michael Yang committed
95
96
97
98
99
100
101
102
103
104
105
	positions := make([]int32, numPatches)
	for i := range positions {
		positions[i] = int32(i)
	}

	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
	if err != nil {
		panic(err)
	}

	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))
Michael Yang's avatar
Michael Yang committed
106

Michael Yang's avatar
Michael Yang committed
107
	for _, layer := range m.Layers {
Michael Yang's avatar
Michael Yang committed
108
109
110
111
112
113
114
		hiddenState = layer.Forward(ctx, hiddenState, m.VisionModelOptions)
	}

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
	return hiddenState
}

115
func newVisionModel(c fs.Config) *VisionModel {
Michael Yang's avatar
Michael Yang committed
116
	return &VisionModel{
Michael Yang's avatar
Michael Yang committed
117
		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
Michael Yang's avatar
Michael Yang committed
118
119
120
121
122
123
124
125
126
127
128
		VisionModelOptions: &VisionModelOptions{
			hiddenSize: int(c.Uint("vision.embedding_length")),
			numHeads:   int(c.Uint("vision.attention.head_count")),

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

			eps: c.Float("vision.attention.layer_norm_epsilon"),
		},
	}
}