OpenDAS / ollama · Commits · 564b558c

Unverified commit 564b558c, authored Sep 17, 2025 by Michael Yang, committed via GitHub on Sep 17, 2025.
Parent: a417ac97

fix(llama): other llama flavours (#12308)

* fix(llama): rope scale
* spm llama
* skip moe models
* cleanup
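The first item, "fix(llama): rope scale", is the pattern repeated across all ten files below: each model now reads the GGUF key "rope.scaling.factor" (default 1) instead of "rope.freq_scale" and passes its reciprocal to fast.RoPE, since the scaling factor (e.g. 8 for an 8x linear context extension) is the inverse of the frequency scale that RoPE applies. A minimal, self-contained Go sketch of that conversion; the helper name is hypothetical and not part of this commit:

package main

import "fmt"

// ropeFreqScale converts a GGUF "rope.scaling.factor" value into the
// frequency scale passed to fast.RoPE. With the default factor of 1 the
// positions are left unscaled, matching the previous behaviour.
func ropeFreqScale(scalingFactor float32) float32 {
	return 1. / scalingFactor
}

func main() {
	fmt.Println(ropeFreqScale(1)) // 1     (no scaling)
	fmt.Println(ropeFreqScale(8)) // 0.125 (e.g. an 8x linear context extension)
}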
Changes: 10 files, 74 additions and 66 deletions (+74 -66)

model/models/gemma2/model.go          +3  -3
model/models/gemma3/model_text.go     +3  -3
model/models/gemma3n/model_text.go    +4  -4
model/models/llama/model.go           +40 -32
model/models/llama4/model_text.go     +4  -4
model/models/mistral3/model_text.go   +4  -4
model/models/mllama/model_text.go     +4  -4
model/models/qwen2/model.go           +4  -4
model/models/qwen25vl/model_text.go   +4  -4
model/models/qwen3/model.go           +4  -4
model/models/gemma2/model.go
@@ -63,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
 		attnValLen:        int(c.Uint("attention.value_length")),
 		eps:               c.Float("attention.layer_norm_rms_epsilon"),
 		ropeBase:          c.Float("rope.freq_base", 10000.0),
-		ropeScale:         c.Float("rope.freq_scale", 1.0),
+		ropeScale:         c.Float("rope.scaling.factor", 1.0),
 		attnLogitSoftcap:  c.Float("attn_logit_softcapping"),
 		finalLogitSoftcap: c.Float("final_logit_softcapping"),
 	},
@@ -88,7 +88,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -98,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
model/models/gemma3/model_text.go
@@ -53,7 +53,7 @@ func newTextModel(c fs.Config) *TextModel {
 		eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-06),
 		ropeLocalBase:  c.Float("rope.local.freq_base", 10000.0),
 		ropeGlobalBase: c.Float("rope.global.freq_base", 1000000.0),
-		ropeScale:      c.Float("rope.freq_scale", 1.0),
+		ropeScale:      c.Float("rope.scaling.factor", 1.0),
 	},
 }
@@ -84,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -95,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
model/models/gemma3n/model_text.go
@@ -95,7 +95,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
 		ropeBase = m.ropeBaseLocal
 	}

-	return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.headDim(), ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 type TextScaledWordEmbedding struct {
@@ -256,14 +256,14 @@ func (attn TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Ten
 	query := attn.Query.Forward(ctx, hiddenStates)
 	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
 	query = attn.QueryNorm.Forward(ctx, query, opts.eps)
-	query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	var key, value ml.Tensor
 	if !sharedKV {
 		key = attn.Key.Forward(ctx, hiddenStates)
 		key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
 		key = attn.KeyNorm.Forward(ctx, key, opts.eps)
-		key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+		key = fast.RoPE(ctx, key, positions, opts.headDim(), ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 		value = attn.Value.Forward(ctx, hiddenStates)
 		value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
@@ -349,7 +349,7 @@ func newTextModel(c fs.Config) *TextModel {
 			eps:                     c.Float("attention.layer_norm_rms_epsilon", 1e-06),
 			ropeBase:                c.Float("rope.freq_base", 1_000_000),
 			ropeBaseLocal:           c.Float("rope.freq_base_local", 10_000),
-			ropeScale:               c.Float("rope.freq_scale", 1.0),
+			ropeScale:               c.Float("rope.scaling.factor", 1.0),
 			slidingWindowPattern:    c.Bools("attention.sliding_window_pattern"),
 			activationSparsityScale: c.Floats("activation_sparsity_scale"),
model/models/llama/model.go
@@ -2,7 +2,6 @@ package llama
 import (
 	"cmp"
-	"fmt"
 	"math"

 	"github.com/ollama/ollama/fs"
@@ -23,51 +22,60 @@ type Options struct {
 type Model struct {
 	model.Base
-	model.BytePairEncoding
+	model.TextProcessor

 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
 	Output         *nn.Linear    `gguf:"output,alt:token_embd"`

-	*Options
+	Options
 }

 func New(c fs.Config) (model.Model, error) {
-	// This model currently only supports the gpt2 tokenizer
-	if c.String("tokenizer.ggml.model") == "llama" {
-		return nil, fmt.Errorf("unsupported tokenizer: llama")
+	if c.Uint("expert_count") > 0 {
+		// TODO: support mixtures of experts
+		return nil, model.ErrUnsupportedModel
 	}

-	// Best effort detection of library/deepseek-coder model(s) which are incompatible
-	if c.String("general.name") == "deepseek-ai" {
-		return nil, fmt.Errorf("unsupported model: %s", c.String("general.name"))
+	var processor model.TextProcessor
+	vocabulary := model.Vocabulary{
+		Values: c.Strings("tokenizer.ggml.tokens"),
+		Scores: c.Floats("tokenizer.ggml.scores"),
+		Types:  c.Ints("tokenizer.ggml.token_type"),
+		Merges: c.Strings("tokenizer.ggml.merges"),
+		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+		EOS: append(
+			[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+			c.Ints("tokenizer.ggml.eos_token_ids")...,
+		),
+	}
+
+	switch c.String("tokenizer.ggml.model") {
+	case "gpt2":
+		processor = model.NewBytePairEncoding(
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			&vocabulary,
+		)
+	case "llama":
+		processor = model.NewSentencePiece(&vocabulary)
+	default:
+		return nil, model.ErrUnsupportedTokenizer
 	}

 	m := Model{
-		BytePairEncoding: model.NewBytePairEncoding(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
-			&model.Vocabulary{
-				Values: c.Strings("tokenizer.ggml.tokens"),
-				Types:  c.Ints("tokenizer.ggml.token_type"),
-				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
-				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOS: append(
-					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
-					c.Ints("tokenizer.ggml.eos_token_ids")...,
-				),
-			},
-		),
-		Layers: make([]Layer, c.Uint("block_count")),
-		Options: &Options{
+		TextProcessor: processor,
+		Layers:        make([]Layer, c.Uint("block_count")),
+		Options: Options{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
 			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:   c.Float("rope.freq_base"),
-			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeBase:   c.Float("rope.freq_base", 1e5),
+			ropeScale:  c.Float("rope.scaling.factor", 1),
 		},
 	}
@@ -98,8 +106,8 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
@@ -108,7 +116,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tenso
 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }

 type MLP struct {
@@ -163,7 +171,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 			outputs = batch.Outputs
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, &m.Options)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
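The "spm llama" and "skip moe models" items from the commit message land in this file: New now bails out on mixture-of-experts checkpoints (expert_count > 0) and selects the text processor from the "tokenizer.ggml.model" metadata instead of hard-coding byte-pair encoding, so SentencePiece ("spm") llama checkpoints are accepted. A self-contained sketch of that selection logic, using simplified stand-in types rather than ollama's actual model package:

package main

import (
	"errors"
	"fmt"
)

var (
	errUnsupportedModel     = errors.New("unsupported model")
	errUnsupportedTokenizer = errors.New("unsupported tokenizer")
)

// textProcessor is a stand-in for the model.TextProcessor interface.
type textProcessor interface{ Kind() string }

type bytePairEncoding struct{}

func (bytePairEncoding) Kind() string { return "byte-pair encoding" }

type sentencePiece struct{}

func (sentencePiece) Kind() string { return "sentencepiece" }

// selectProcessor mirrors the new logic in llama's New: mixture-of-experts
// checkpoints are skipped, "gpt2" selects BPE, "llama" selects SentencePiece,
// and anything else is rejected.
func selectProcessor(tokenizerModel string, expertCount uint32) (textProcessor, error) {
	if expertCount > 0 {
		return nil, errUnsupportedModel
	}
	switch tokenizerModel {
	case "gpt2":
		return bytePairEncoding{}, nil
	case "llama":
		return sentencePiece{}, nil
	default:
		return nil, errUnsupportedTokenizer
	}
}

func main() {
	if p, err := selectProcessor("llama", 0); err == nil {
		fmt.Println(p.Kind()) // sentencepiece
	}
}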
model/models/llama4/model_text.go
@@ -33,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

 	if useRope {
-		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
-		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 	}

 	if opts.useQKNorm {
@@ -196,7 +196,7 @@ func newTextModel(c fs.Config) *TextModel {
 			numExpertsUsed:      int(c.Uint("expert_used_count")),
 			ropeDim:             int(c.Uint("rope.dimension_count")),
 			ropeBase:            c.Float("rope.freq_base"),
-			ropeScale:           c.Float("rope.freq_scale", 1),
+			ropeScale:           c.Float("rope.scaling.factor", 1),
 			eps:                 c.Float("attention.layer_norm_rms_epsilon"),
 			interleaveLayerStep: int(c.Uint("interleave_moe_layer_step", 1)),
 			noRopeInterval:      int(c.Uint("no_rope_interval", 4)),
@@ -248,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
 }
model/models/mistral3/model_text.go
@@ -40,11 +40,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -55,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale), nil
 }

 type MLP struct {
@@ -132,7 +132,7 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:   int(c.Uint("rope.dimension_count")),
 			eps:       c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:  c.Float("rope.freq_base"),
-			ropeScale: c.Float("rope.freq_scale", 1),
+			ropeScale: c.Float("rope.scaling.factor", 1),
 		},
 	}
 }
model/models/mllama/model_text.go
@@ -26,11 +26,11 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -45,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
+		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
 	}

 	return key, nil
@@ -244,7 +244,7 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:              int(c.Uint("rope.dimension_count")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
-			ropeScale:            c.Float("rope.freq_scale", 1),
+			ropeScale:            c.Float("rope.scaling.factor", 1),
 			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
 		},
 	}
model/models/qwen2/model.go
@@ -43,8 +43,8 @@ func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
 	value := attn.Value.Forward(ctx, hiddenStates)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
 	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
@@ -124,7 +124,7 @@ func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
-	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 func New(c fs.Config) (model.Model, error) {
@@ -160,7 +160,7 @@ func New(c fs.Config) (model.Model, error) {
 			headDim:   int(c.Uint("attention.key_length")),
 			ropeDim:   int(c.Uint("rope.dimension_count")),
 			ropeBase:  c.Float("rope.freq_base"),
-			ropeScale: c.Float("rope.freq_scale", 1),
+			ropeScale: c.Float("rope.scaling.factor", 1),
 			eps:       c.Float("attention.layer_norm_rms_epsilon"),
 		},
 	}
model/models/qwen25vl/model_text.go
@@ -38,7 +38,7 @@ func NewTextModel(c fs.Config) *TextModel {
 			originalContextLength: int(c.Uint("context_length", 128000)),
 			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.freq_scale", 1),
+			ropeScale:             c.Float("rope.scaling.factor", 1),
 		},
 	}
@@ -60,11 +60,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, 1./opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -78,7 +78,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 // Shift applies rotary position embeddings to the key tensor for causal attention caching
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, 1./m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
 }

 // MLP implements the feed-forward network component with SwiGLU activation
model/models/qwen3/model.go
@@ -52,8 +52,8 @@ func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor,
 	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
 	key = sa.KeyNorm.Forward(ctx, key, opts.eps)

-	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
-	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, rope.WithTypeNeoX())

 	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
 	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
@@ -173,7 +173,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, 1./m.ropeScale, rope.WithTypeNeoX()), nil
 }

 var _ model.Model = (*Model)(nil)
@@ -213,7 +213,7 @@ func New(c fs.Config) (model.Model, error) {
 			valueLength:    int(c.Uint("attention.value_length")),
 			eps:            c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:       c.Float("rope.freq_base"),
-			ropeScale:      c.Float("rope.freq_scale", 1),
+			ropeScale:      c.Float("rope.scaling.factor", 1),
 			numExperts:     int(c.Uint("expert_count")),
 			numExpertsUsed: int(c.Uint("expert_used_count")),
 			normTopKProb:   c.Bool("norm_top_k_prob", true),