model_vision.go 3.76 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
package gemma3

import (
	"math"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

var batchSize int = 1

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	key := sa.Key.Forward(ctx, hiddenState)
	value := sa.Value.Forward(ctx, hiddenState)

Michael Yang's avatar
Michael Yang committed
26
27
28
	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
Michael Yang's avatar
Michael Yang committed
29

Michael Yang's avatar
Michael Yang committed
30
	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
Michael Yang's avatar
Michael Yang committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

	hiddenState = sa.Output.Forward(ctx, attention)
	return hiddenState
}

type VisionMLP struct {
	FC1 *nn.Linear `gguf:"fc1"`
	FC2 *nn.Linear `gguf:"fc2"`
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	hiddenState = mlp.FC1.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.FC2.Forward(ctx, hiddenState)
	return hiddenState
}

type VisionEncoderLayer struct {
Jesse Gross's avatar
Jesse Gross committed
49
	LayerNorm1    *nn.LayerNorm `gguf:"layer_norm1"`
Michael Yang's avatar
Michael Yang committed
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
	SelfAttention *VisionSelfAttention

	LayerNorm2 *nn.LayerNorm `gguf:"layer_norm2"`
	MLP        *VisionMLP    `gguf:"mlp"`
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenState

	// self attention
	hiddenState = e.LayerNorm1.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// feed forward
	hiddenState = e.LayerNorm2.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}

type VisionModelOptions struct {
jmorganca's avatar
jmorganca committed
72
73
74
	hiddenSize, numHeads int
	imageSize, patchSize int
	eps                  float32
Michael Yang's avatar
Michael Yang committed
75
76
77
78
79
80
81
}

type VisionModel struct {
	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embedding"`
	PositionEmbedding *nn.Embedding `gguf:"position_embedding"`
	PostLayerNorm     *nn.LayerNorm `gguf:"post_layernorm"`

Michael Yang's avatar
Michael Yang committed
82
	Layers []VisionEncoderLayer `gguf:"blk"`
Michael Yang's avatar
Michael Yang committed
83
84
85
86

	*VisionModelOptions
}

Michael Yang's avatar
Michael Yang committed
87
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
Michael Yang's avatar
Michael Yang committed
88
89
90
91
92
93
	numPatches := (m.imageSize / m.patchSize) * (m.imageSize / m.patchSize)

	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

Michael Yang's avatar
Michael Yang committed
94
95
96
97
98
99
100
101
102
103
104
	positions := make([]int32, numPatches)
	for i := range positions {
		positions[i] = int32(i)
	}

	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
	if err != nil {
		panic(err)
	}

	hiddenState = hiddenState.Add(ctx, m.PositionEmbedding.Forward(ctx, positionIDs))
Michael Yang's avatar
Michael Yang committed
105

Michael Yang's avatar
Michael Yang committed
106
	for _, layer := range m.Layers {
Michael Yang's avatar
Michael Yang committed
107
108
109
110
111
112
113
114
115
		hiddenState = layer.Forward(ctx, hiddenState, m.VisionModelOptions)
	}

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
	return hiddenState
}

func newVisionModel(c ml.Config) *VisionModel {
	return &VisionModel{
Michael Yang's avatar
Michael Yang committed
116
		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
Michael Yang's avatar
Michael Yang committed
117
118
119
120
121
122
123
124
125
126
127
		VisionModelOptions: &VisionModelOptions{
			hiddenSize: int(c.Uint("vision.embedding_length")),
			numHeads:   int(c.Uint("vision.attention.head_count")),

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

			eps: c.Float("vision.attention.layer_norm_epsilon"),
		},
	}
}