OpenDAS / ollama, commit a838421e (unverified)

model: conversion and hyperparameter fixes for ministral and devstral (#13424)

Authored by Jeffrey Morgan on Dec 11, 2025; committed via GitHub on Dec 11, 2025.
Parent commit: 1c4e85b4

Showing 4 changed files with 250 additions and 12 deletions (+250, -12).
convert/convert.go                    +2   -0
convert/convert_mistral.go            +26  -9
convert/convert_mistral_causal.go     +181 -0
model/models/mistral3/model_text.go   +41  -3
convert/convert.go

```diff
@@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &llama4Model{}
 	case "Mistral3ForConditionalGeneration":
 		conv = &mistral3Model{}
+	case "Ministral3ForCausalLM":
+		conv = &mistral3CausalModel{}
 	case "MixtralForCausalLM":
 		conv = &mixtralModel{}
 	case "GemmaForCausalLM":
```
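Judging from the case labels, ConvertModel dispatches on the checkpoint's architecture name (the `architectures` entry in config.json): text-only Ministral checkpoints ("Ministral3ForCausalLM") now route to a dedicated mistral3CausalModel converter instead of the multimodal mistral3Model. A minimal stand-alone sketch of that kind of dispatch; the JSON and the surrounding plumbing here are hypothetical, not Ollama's actual code:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical config.json excerpt for a text-only Ministral checkpoint.
	raw := []byte(`{"architectures": ["Ministral3ForCausalLM"], "hidden_size": 4096}`)

	var cfg struct {
		Architectures []string `json:"architectures"`
	}
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}

	// Mirrors the switch in ConvertModel above, reduced to two cases.
	switch cfg.Architectures[0] {
	case "Mistral3ForConditionalGeneration":
		fmt.Println("multimodal mistral3 converter")
	case "Ministral3ForCausalLM":
		fmt.Println("text-only mistral3 causal converter")
	default:
		fmt.Println("unsupported architecture")
	}
}
```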
convert/convert_mistral.go

```diff
@@ -33,10 +33,12 @@ type mistral3Model struct {
 			BetaFast                  float32  `json:"beta_fast"`
 			BetaSlow                  float32  `json:"beta_slow"`
 			Factor                    float32  `json:"factor"`
-			ScalingBeta               float32  `json:"llama_4_scaling_beta"`
+			Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
 			OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
 			RopeType                  string   `json:"rope_type"`
 			RopeTheta                 float32  `json:"rope_theta"`
+			Mscale                    *float32 `json:"mscale"`
+			MscaleAllDim              *float32 `json:"mscale_all_dim"`
 		} `json:"rope_parameters"`
 	} `json:"text_config"`
 	VisionModel struct {
@@ -50,6 +52,9 @@ type mistral3Model struct {
 		HeadDim        uint32  `json:"head_dim"`
 		HiddenAct      string  `json:"hidden_act"`
 		RopeTheta      float32 `json:"rope_theta"`
+		RopeParameters struct {
+			RopeTheta float32 `json:"rope_theta"`
+		} `json:"rope_parameters"`
 	} `json:"vision_config"`
 	MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
 	ProjectorHiddenAct      string `json:"projector_hidden_act"`
```
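The fields added to rope_parameters (Llama4ScalingBeta, Mscale, MscaleAllDim) are *float32 rather than float32, which lets the converter tell a key that is absent apart from one present with value 0, and emit the matching GGUF entries only when the key exists. A small self-contained illustration of that encoding/json behaviour; the JSON snippets are made up:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type ropeParams struct {
	Factor            float32  `json:"factor"`
	Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
}

func main() {
	var withBeta, withoutBeta ropeParams
	_ = json.Unmarshal([]byte(`{"factor": 8, "llama_4_scaling_beta": 0.1}`), &withBeta)
	_ = json.Unmarshal([]byte(`{"factor": 8}`), &withoutBeta)

	// The pointer is non-nil only when the key was present, even if its value is 0.
	fmt.Println(withBeta.Llama4ScalingBeta != nil)    // true: safe to dereference
	fmt.Println(withoutBeta.Llama4ScalingBeta != nil) // false: skip the GGUF entry
}
```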
```diff
@@ -72,10 +77,22 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
 	kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
 	kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+	kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
+	kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
+	kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
+	kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
+	if p.TextModel.RopeParameters.Mscale != nil {
+		kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
+	}
+	if p.TextModel.RopeParameters.MscaleAllDim != nil {
+		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
+	}
 	if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
 		kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
-		kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
+	}
+	if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
+		kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
 	}

 	// Vision configuration
@@ -88,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
 	kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
 	kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
 	// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
-	kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
+	kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)

 	// Multimodal configuration
 	kv["mistral3.image_token_index"] = p.ImageTokenIndex
```
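Both freq_base lookups now go through the standard library's cmp.Or (Go 1.22+), which returns its first argument that is not the zero value: a top-level rope_theta wins, and the nested rope_parameters.rope_theta is the fallback. A tiny runnable sketch with made-up values:

```go
package main

import (
	"cmp"
	"fmt"
)

func main() {
	// Hypothetical values: the top-level rope_theta is absent (0), so the
	// nested rope_parameters.rope_theta is used instead.
	var topLevelTheta float32 = 0
	var nestedTheta float32 = 1_000_000

	fmt.Println(cmp.Or(topLevelTheta, nestedTheta)) // 1e+06

	// If every argument is the zero value, cmp.Or returns the zero value.
	fmt.Println(cmp.Or[float32](0, 0)) // 0
}
```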
convert/convert_mistral_causal.go (new file, mode 100644)

```go
package convert

import (
	"cmp"
	"fmt"
	"strings"

	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"

	"github.com/ollama/ollama/fs/ggml"
)

type mistral3CausalModel struct {
	ModelParameters
	NumHiddenLayers       uint32  `json:"num_hidden_layers"`
	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
	HiddenSize            uint32  `json:"hidden_size"`
	IntermediateSize      uint32  `json:"intermediate_size"`
	NumAttentionHeads     uint32  `json:"num_attention_heads"`
	NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
	RopeTheta             float32 `json:"rope_theta"`
	RMSNormEPS            float32 `json:"rms_norm_eps"`
	HeadDim               uint32  `json:"head_dim"`
	SlidingWindow         *uint32 `json:"sliding_window"`
	HiddenAct             string  `json:"hidden_act"`
	VocabSize             uint32  `json:"vocab_size"`
	RopeParameters        struct {
		BetaFast                  float32  `json:"beta_fast"`
		BetaSlow                  float32  `json:"beta_slow"`
		Factor                    float32  `json:"factor"`
		Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
		OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
		RopeType                  string   `json:"rope_type"`
		RopeTheta                 float32  `json:"rope_theta"`
		Mscale                    *float32 `json:"mscale"`
		MscaleAllDim              *float32 `json:"mscale_all_dim"`
	} `json:"rope_parameters"`
}

func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "mistral3"
	kv["mistral3.vocab_size"] = p.VocabSize

	// Text configuration
	kv["mistral3.block_count"] = p.NumHiddenLayers
	kv["mistral3.context_length"] = p.MaxPositionEmbeddings
	kv["mistral3.embedding_length"] = p.HiddenSize
	kv["mistral3.feed_forward_length"] = p.IntermediateSize
	kv["mistral3.attention.head_count"] = p.NumAttentionHeads
	kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
	kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
	kv["mistral3.attention.key_length"] = p.HeadDim
	kv["mistral3.attention.value_length"] = p.HeadDim
	kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
	kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
	kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
	kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
	kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
	kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
	if p.RopeParameters.Mscale != nil {
		kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
	}
	if p.RopeParameters.MscaleAllDim != nil {
		kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
	}
	if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
		kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
	}
	if p.RopeParameters.Llama4ScalingBeta != nil {
		kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
	}

	return kv
}

func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
	var out []*ggml.Tensor

	for _, t := range ts {
		if !strings.HasPrefix(t.Name(), "v.") {
			if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
				strings.HasSuffix(t.Name(), ".attn_k.weight") {
				t.SetRepacker(p.repack)
			}
		}

		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}

	return out
}

func (p *mistral3CausalModel) Replacements() []string {
	return []string{
		"model.norm", "output_norm",
		"model.", "",
		"layers", "blk",
		"transformer.layers", "blk",
		"vision_tower", "v",
		"ln_pre", "encoder_norm",
		"input_layernorm", "attn_norm",
		"post_attention_layernorm", "ffn_norm",
		"embed_tokens", "token_embd",
		"self_attn.q_proj", "attn_q",
		"self_attn.k_proj", "attn_k",
		"self_attn.v_proj", "attn_v",
		"self_attn.o_proj", "attn_output",
		"mlp.down_proj", "ffn_down",
		"mlp.gate_proj", "ffn_gate",
		"mlp.up_proj", "ffn_up",
		"attention.q_proj", "attn_q",
		"attention.k_proj", "attn_k",
		"attention.v_proj", "attn_v",
		"attention.o_proj", "attn_output",
		"attention_norm", "attn_norm",
		"feed_forward.gate_proj", "ffn_gate",
		"feed_forward.down_proj", "ffn_down",
		"feed_forward.up_proj", "ffn_up",
		"multi_modal_projector", "mm",
		"ffn_norm", "ffn_norm",
		"lm_head", "output",
	}
}

func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
	var dims []int
	for _, dim := range shape {
		dims = append(dims, int(dim))
	}

	var heads uint32
	if strings.HasSuffix(name, ".attn_q.weight") {
		heads = p.NumAttentionHeads
	} else if strings.HasSuffix(name, ".attn_k.weight") {
		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
	} else {
		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
	}

	n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
	if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
		return nil, err
	}

	if err := n.T(0, 2, 1, 3); err != nil {
		return nil, err
	}

	if err := n.Reshape(dims...); err != nil {
		return nil, err
	}

	if err := n.Transpose(); err != nil {
		return nil, err
	}

	ts, err := native.SelectF32(n, 1)
	if err != nil {
		return nil, err
	}

	var f32s []float32
	for _, t := range ts {
		f32s = append(f32s, t...)
	}

	return f32s, nil
}
```
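The repack hook only reorders rows within each attention head: assuming the source checkpoint stores a head's rotary dimensions as two contiguous halves, the reshape to (heads, 2, d/2, ...) followed by the (0, 2, 1, 3) transpose interleaves those halves. A dependency-free sketch of the same permutation on a toy eight-row head; this helper is illustrative only and not part of the converter:

```go
package main

import "fmt"

// interleaveHeadRows reorders one head's rows from "two contiguous halves"
// order [r0..r3, r4..r7] to interleaved order [r0, r4, r1, r5, ...], the same
// index permutation the reshape/transpose/reshape sequence above produces.
func interleaveHeadRows(rows []string) []string {
	half := len(rows) / 2
	out := make([]string, 0, len(rows))
	for i := 0; i < half; i++ {
		out = append(out, rows[i], rows[half+i])
	}
	return out
}

func main() {
	head := []string{"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7"}
	fmt.Println(interleaveHeadRows(head)) // [r0 r4 r1 r5 r2 r6 r3 r7]
}
```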
model/models/mistral3/model_text.go

```diff
@@ -8,6 +8,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )
```
```diff
@@ -17,10 +18,41 @@ type TextOptions struct {
 	eps, ropeBase, ropeScale float32
 	ropeOrigPosEmbeddings    int
 	ropeScalingBeta          float32
+	ropeType                 string
+	ropeExtrapolation        float32
+	ropeBetaFast             float32
+	ropeBetaSlow             float32
+	ropeMscale               float32
+	ropeMscaleAllDim         float32
 }

 func (o TextOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
-	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale)
+	var ropeOpts []func(*rope.Options)
+	if o.ropeType == "yarn" {
+		getMscale := func(scale, mscale float64) float64 {
+			if scale <= 1.0 {
+				return 1.0
+			}
+			return 0.1*mscale*math.Log(scale) + 1.0
+		}
+
+		var attnFactor float32
+		if o.ropeMscale != 0 && o.ropeMscaleAllDim != 0 {
+			attnFactor = float32(getMscale(float64(o.ropeScale), float64(o.ropeMscale)) / getMscale(float64(o.ropeScale), float64(o.ropeMscaleAllDim)))
+		} else {
+			attnFactor = float32(getMscale(float64(o.ropeScale), 1))
+		}
+
+		ropeOpts = append(ropeOpts,
+			rope.WithOriginalContextLength(o.ropeOrigPosEmbeddings),
+			rope.WithExtrapolationFactor(o.ropeExtrapolation),
+			rope.WithAttentionFactor(attnFactor),
+			rope.WithBetaFast(o.ropeBetaFast),
+			rope.WithBetaSlow(o.ropeBetaSlow),
+		)
+	}
+
+	return nn.RoPE(ctx, states, positions, o.ropeDim, o.ropeBase, 1./o.ropeScale, ropeOpts...)
 }

 type TextModel struct {
```
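When rope.scaling.type is "yarn", the attention factor follows the YaRN magnitude-scaling formula used in the closure above: 1.0 for scale <= 1, otherwise 0.1 * mscale * ln(scale) + 1, and the ratio of two such factors when both mscale and mscale_all_dim are set. A tiny stand-alone check of that arithmetic with made-up values:

```go
package main

import (
	"fmt"
	"math"
)

// getMscale mirrors the helper inside applyRotaryPositionEmbeddings above.
func getMscale(scale, mscale float64) float64 {
	if scale <= 1.0 {
		return 1.0
	}
	return 0.1*mscale*math.Log(scale) + 1.0
}

func main() {
	// Hypothetical checkpoint: context extended 8x, no mscale overrides.
	scale := 8.0
	fmt.Printf("%.4f\n", getMscale(scale, 1)) // 0.1*ln(8)+1, about 1.2079

	// With mscale and mscale_all_dim both set, the two factors are divided.
	mscale, mscaleAllDim := 1.0, 0.707
	fmt.Printf("%.4f\n", getMscale(scale, mscale)/getMscale(scale, mscaleAllDim)) // about 1.0531
}
```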
```diff
@@ -150,9 +182,15 @@ func newTextModel(c fs.Config) *TextModel {
 			ropeDim:               int(c.Uint("rope.dimension_count")),
 			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:              c.Float("rope.freq_base"),
-			ropeScale:             c.Float("rope.scaling.factor", 1),
+			ropeScale:             c.Float("rope.scaling.factor", 1.0),
 			ropeOrigPosEmbeddings: int(c.Uint("rope.scaling.original_context_length")),
-			ropeScalingBeta:       c.Float("rope.scaling_beta"),
+			ropeScalingBeta:       c.Float("rope.scaling_beta", 0.1),
+			ropeBetaFast:          c.Float("rope.scaling.beta_fast", 32.0),
+			ropeBetaSlow:          c.Float("rope.scaling.beta_slow", 1.0),
+			ropeType:              c.String("rope.scaling.type"),
+			ropeMscale:            c.Float("rope.scaling.mscale"),
+			ropeMscaleAllDim:      c.Float("rope.scaling.mscale_all_dim"),
+			ropeExtrapolation:     c.Float("rope.scaling.extrapolation_factor", 1),
 		},
 	}
 }
```
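The new options are read with explicit defaults (beta_fast 32, beta_slow 1, extrapolation factor 1, scaling_beta 0.1), so a GGUF produced before these keys existed still loads with sensible values. A stand-alone sketch of the same fallback pattern; the lookup helper here is hypothetical and not Ollama's fs.Config:

```go
package main

import "fmt"

// getFloat stands in for a GGUF metadata lookup with a default, mirroring how
// c.Float(key, default) is used in newTextModel above.
func getFloat(kv map[string]float32, key string, def float32) float32 {
	if v, ok := kv[key]; ok {
		return v
	}
	return def
}

func main() {
	// Metadata from an older conversion that predates the yarn keys.
	kv := map[string]float32{"rope.freq_base": 1e6}

	fmt.Println(getFloat(kv, "rope.scaling.beta_fast", 32.0)) // 32: default used
	fmt.Println(getFloat(kv, "rope.scaling.beta_slow", 1.0))  // 1: default used
	fmt.Println(getFloat(kv, "rope.freq_base", 10000))        // 1e+06: stored value wins
}
```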