"vscode:/vscode.git/clone" did not exist on "7eb9d8e5947b4232daf3ce708cbd728e2c677999"
convert_gemma3.go 6.88 KB
Newer Older
Patrick Devine's avatar
Patrick Devine committed
1
2
package convert

import (
	"cmp"
	"slices"
)

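// gemma3Model describes the fields read from a Gemma 3 HuggingFace config.json.
// Text-only checkpoints put their parameters at the top level; multimodal
// checkpoints nest them under text_config and vision_config.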
type gemma3Model struct {
	gemmaModel
	Architecture string
	TextModel    struct {
		HeadDim          uint32 `json:"head_dim"`
		HiddenSize       uint32 `json:"hidden_size"`
		HiddenLayers     uint32 `json:"num_hidden_layers"`
		IntermediateSize uint32 `json:"intermediate_size"`
		SlidingWindow    uint32 `json:"sliding_window"`
	} `json:"text_config"`
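	// VisionModel holds the vision tower settings from vision_config; the
	// inline comments note the matching GGUF key with a sample value.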
	VisionModel struct {
		NumAttentionHeads uint32  `json:"num_attention_heads"` // attention.head_count 16
		LayerNormEpsilon  float32 `json:"layer_norm_eps"`      // attention.layer_norm_epsilon 1e-05
		NumHiddenLayers   uint32  `json:"num_hidden_layers"`   // block_count 32
		HiddenSize        uint32  `json:"hidden_size"`         // embedding_length 1280
		IntermediateSize  uint32  `json:"intermediate_size"`   // feed_forward_length 5120
		ImageSize         uint32  `json:"image_size"`          // image_size 560
		NumChannels       uint32  `json:"num_channels"`        // num_channels 3
		PatchSize         uint32  `json:"patch_size"`          // patch_size 14
	} `json:"vision_config"`
	MaxPositionEmbeddings    uint32   `json:"max_position_embeddings"`
	NumAttentionHeads        uint32   `json:"num_attention_heads"`
	NumKeyValueHeads         uint32   `json:"num_key_value_heads"`
	RMSNormEPS               float32  `json:"rms_norm_eps"`
	HeadDim                  uint32   `json:"head_dim"`
	FinalLogitSoftcap        float32  `json:"final_logit_softcapping"`
	RopeLocalTheta           float32  `json:"rope_local_base_freq"`
	RopeTheta                float32  `json:"rope_theta"`
	SlidingWindow            uint32   `json:"sliding_window"`
	SlidingWindowPattern     *uint32  `json:"sliding_window_pattern"`
	LayerTypes               []string `json:"layer_types"`
	MultiModalTokensPerImage uint32   `json:"mm_tokens_per_image"`
	RopeScaling              *struct {
		Type                          string  `json:"rope_type"`
		Factor                        float32 `json:"factor"`
		OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
		ExtrapolationFactor           float32 `json:"extrapolation_factor"`
		BetaFast                      float32 `json:"beta_fast"`
		BetaSlow                      float32 `json:"beta_slow"`
	} `json:"rope_scaling"`
}

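// Layer counts of the known Gemma 3 model sizes, used below to infer head
// counts when the config does not specify them.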
const (
	gemma4BLayerCount  = 34
	gemma12BLayerCount = 48
	gemma27BLayerCount = 62
)

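// KV maps the parsed HuggingFace config onto GGUF-style "gemma3.*" metadata keys.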
func (p *gemma3Model) KV(t *Tokenizer) KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma3"

	numBlocks := cmp.Or(p.HiddenLayers, p.TextModel.HiddenLayers)
	kv["gemma3.block_count"] = numBlocks

	var (
		numHeads   uint32
		numKVHeads uint32
	)

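	// Infer head counts from the known layer counts of the 4B/12B/27B
	// checkpoints; for other sizes, fall back to the config values.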
	switch numBlocks {
	case gemma4BLayerCount:
		numHeads = 8
		numKVHeads = 4
	case gemma12BLayerCount:
		numHeads = 16
		numKVHeads = 8
	case gemma27BLayerCount:
		numHeads = 32
		numKVHeads = 16
	default:
		numHeads = p.NumAttentionHeads
		numKVHeads = p.NumKeyValueHeads
	}

	kv["gemma3.attention.head_count"] = numHeads
	kv["gemma3.attention.head_count_kv"] = numKVHeads

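	// Gemma3ForCausalLM (text-only) keeps its parameters at the top level of
	// config.json; anything else is treated as a multimodal checkpoint with
	// nested text_config/vision_config sections.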
	switch p.Architecture {
	case "Gemma3ForCausalLM":
		kv["gemma3.context_length"] = p.MaxPositionEmbeddings
		kv["gemma3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
		kv["gemma3.attention.key_length"] = p.HeadDim
		kv["gemma3.attention.value_length"] = p.HeadDim
		kv["gemma3.attention.sliding_window"] = p.SlidingWindow

		// The sliding window pattern is either provided as the sliding_window_pattern
		// key (an int) or as the layer_types key (a list of strings).
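		// With sliding_window_pattern = 6, for example, layers 0-4 use local
		// (sliding) attention and every sixth layer is global.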
		if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
			kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
				for i := range numBlocks {
					var isLocal bool
					if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
						isLocal = p.LayerTypes[i] == "sliding_attention"
					} else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
						isLocal = (i+1)%*p.SlidingWindowPattern != 0
					}
					if !yield(isLocal) {
						break
					}
				}
			})
		}
		if p.FinalLogitSoftcap > 0 {
			kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
		}
		kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
		kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
		if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
			kv["gemma3.rope.scaling.type"] = "yarn"
			kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
			kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
			kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
			kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
			kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
		}

		kv["gemma3.embedding_length"] = p.HiddenSize
		kv["gemma3.feed_forward_length"] = p.IntermediateSize
	default:
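		// Multimodal checkpoints: text parameters come from text_config and
		// the vision tower metadata from vision_config.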
		kv["gemma3.context_length"] = cmp.Or(p.MaxPositionEmbeddings, 131072)
		kv["gemma3.embedding_length"] = p.TextModel.HiddenSize
		kv["gemma3.feed_forward_length"] = p.TextModel.IntermediateSize
		kv["gemma3.attention.sliding_window"] = p.TextModel.SlidingWindow
		kv["gemma3.vision.block_count"] = p.VisionModel.NumHiddenLayers
		kv["gemma3.vision.embedding_length"] = p.VisionModel.HiddenSize
		kv["gemma3.vision.feed_forward_length"] = p.VisionModel.IntermediateSize
		kv["gemma3.vision.image_size"] = p.VisionModel.ImageSize
		kv["gemma3.vision.patch_size"] = p.VisionModel.PatchSize
		kv["gemma3.vision.num_channels"] = cmp.Or(p.VisionModel.NumChannels, 3)
		kv["gemma3.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
		kv["gemma3.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, 1e-6)
		kv["gemma3.attention.key_length"] = cmp.Or(p.TextModel.HeadDim, 256)
		kv["gemma3.attention.value_length"] = cmp.Or(p.TextModel.HeadDim, 256)
	}

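	// Present only for multimodal checkpoints.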
	if p.MultiModalTokensPerImage > 0 {
		kv["gemma3.mm.tokens_per_image"] = p.MultiModalTokensPerImage
	}

	return kv
}

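// Replacements returns old/new substring pairs used to rewrite HuggingFace
// tensor names into their GGUF equivalents.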
func (p *gemma3Model) Replacements() []string {
	return []string{
		"lm_head", "output",
		"model.embed_tokens", "token_embd",
		"model.norm", "output_norm",
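		// Both vision_tower.* and vision_model.* prefixes appear across checkpoints.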
		"vision_tower.vision_model.embeddings", "v",
		"vision_tower.vision_model", "v",
		"vision_model.vision_model.embeddings", "v",
		"vision_model.vision_model", "v",
		"language_model.", "",
		"model.layers", "blk",
		"encoder.layers", "blk",
		"input_layernorm", "attn_norm",
		"self_attn.q_proj", "attn_q",
		"self_attn.q_norm", "attn_q_norm",
		"self_attn.k_proj", "attn_k",
		"self_attn.k_norm", "attn_k_norm",
		"self_attn.v_proj", "attn_v",
		"self_attn.o_proj", "attn_output",
		"self_attn.out_proj", "attn_output",
		"mlp.gate_proj", "ffn_gate",
		"mlp.down_proj", "ffn_down",
		"mlp.up_proj", "ffn_up",
		"post_attention_layernorm", "post_attention_norm",
		"pre_feedforward_layernorm", "ffn_norm",
		"post_feedforward_layernorm", "post_ffw_norm",
		"input_projection_weight", "input_projection.weight",
		"multi_modal_projector", "mm",
	}
}