Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
b2b270ad
Commit
b2b270ad
authored
Jun 23, 2025
by
Devon Rifkin
Browse files
Merge branch 'main' into drifkin/array-head-count-simple
parents
20c5fd39
2bb69b40
Changes
288
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
391 additions
and
1287 deletions
+391
-1287
model/bytepairencoding.go
model/bytepairencoding.go
+15
-120
model/bytepairencoding_test.go
model/bytepairencoding_test.go
+0
-0
model/input/input.go
model/input/input.go
+20
-6
model/model.go
model/model.go
+11
-19
model/models/gemma2/model.go
model/models/gemma2/model.go
+14
-15
model/models/gemma3/model.go
model/models/gemma3/model.go
+18
-24
model/models/gemma3/model_text.go
model/models/gemma3/model_text.go
+6
-18
model/models/llama/model.go
model/models/llama/model.go
+33
-40
model/models/llama4/model.go
model/models/llama4/model.go
+62
-65
model/models/llama4/model_text.go
model/models/llama4/model_text.go
+14
-21
model/models/llama4/model_vision.go
model/models/llama4/model_vision.go
+2
-5
model/models/mistral3/model.go
model/models/mistral3/model.go
+29
-52
model/models/mistral3/model_text.go
model/models/mistral3/model_text.go
+13
-52
model/models/mistral3/model_vision.go
model/models/mistral3/model_vision.go
+4
-14
model/models/mllama/imageproc.go
model/models/mllama/imageproc.go
+0
-201
model/models/mllama/imageproc_test.go
model/models/mllama/imageproc_test.go
+0
-420
model/models/mllama/model.go
model/models/mllama/model.go
+22
-59
model/models/mllama/model_text.go
model/models/mllama/model_text.go
+16
-15
model/models/mllama/model_vision.go
model/models/mllama/model_vision.go
+38
-51
model/models/mllama/process_image.go
model/models/mllama/process_image.go
+74
-90
No files found.
model/
process_text
.go
→
model/
bytepairencoding
.go
View file @
b2b270ad
...
@@ -2,117 +2,17 @@ package model
...
@@ -2,117 +2,17 @@ package model
import
(
import
(
"cmp"
"cmp"
"context"
"fmt"
"iter"
"iter"
"log/slog"
"log/slog"
"slices"
"strings"
"strings"
"sync"
"github.com/dlclark/regexp2"
"github.com/dlclark/regexp2"
heap
"github.com/emirpasic/gods/v2/trees/binaryheap"
heap
"github.com/emirpasic/gods/v2/trees/binaryheap"
"github.com/ollama/ollama/logutil"
)
)
type
Special
int32
const
(
SpecialBOS
Special
=
iota
SpecialEOS
)
const
(
TOKEN_TYPE_NORMAL
=
iota
+
1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
type
TextProcessor
interface
{
Encode
(
s
string
,
addSpecial
bool
)
([]
int32
,
error
)
Decode
([]
int32
)
(
string
,
error
)
Is
(
int32
,
Special
)
bool
Vocabulary
()
*
Vocabulary
}
type
Vocabulary
struct
{
Values
[]
string
Types
[]
int32
Scores
[]
float32
Merges
[]
string
BOS
,
EOS
,
EOT
int32
AddBOS
,
AddEOS
,
AddEOT
bool
specialOnce
sync
.
Once
special
[]
string
valuesOnce
sync
.
Once
values
map
[
string
]
int32
mergeOnce
sync
.
Once
merge
map
[
string
]
int32
}
func
(
v
*
Vocabulary
)
Is
(
id
int32
,
special
Special
)
bool
{
switch
special
{
case
SpecialBOS
:
return
id
==
v
.
BOS
case
SpecialEOS
:
return
id
==
v
.
EOS
||
id
==
v
.
EOT
default
:
return
false
}
}
func
(
v
*
Vocabulary
)
Encode
(
s
string
)
int32
{
v
.
valuesOnce
.
Do
(
func
()
{
v
.
values
=
make
(
map
[
string
]
int32
,
len
(
v
.
Values
))
for
i
,
value
:=
range
v
.
Values
{
v
.
values
[
value
]
=
int32
(
i
)
}
})
if
id
,
ok
:=
v
.
values
[
s
];
ok
{
return
id
}
return
-
1
}
func
(
v
*
Vocabulary
)
Decode
(
id
int32
)
string
{
return
v
.
Values
[
id
]
}
func
(
v
*
Vocabulary
)
SpecialVocabulary
()
[]
string
{
v
.
specialOnce
.
Do
(
func
()
{
for
i
:=
range
v
.
Values
{
if
slices
.
Contains
([]
int
{
105
,
106
},
i
)
{
v
.
special
=
append
(
v
.
special
,
v
.
Values
[
i
])
}
else
if
v
.
Types
[
i
]
==
TOKEN_TYPE_CONTROL
{
v
.
special
=
append
(
v
.
special
,
v
.
Values
[
i
])
}
}
})
return
v
.
special
}
func
(
v
*
Vocabulary
)
Merge
(
left
,
right
string
)
int
{
v
.
mergeOnce
.
Do
(
func
()
{
v
.
merge
=
make
(
map
[
string
]
int32
,
len
(
v
.
Merges
))
for
i
,
merge
:=
range
v
.
Merges
{
v
.
merge
[
merge
]
=
int32
(
i
)
}
})
if
id
,
ok
:=
v
.
merge
[
left
+
" "
+
right
];
ok
{
return
int
(
id
)
}
return
-
1
}
type
BytePairEncoding
struct
{
type
BytePairEncoding
struct
{
pre
*
regexp2
.
Regexp
pre
*
regexp2
.
Regexp
vocab
*
Vocabulary
vocab
*
Vocabulary
...
@@ -302,29 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
...
@@ -302,29 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
}
}
if
addSpecial
&&
len
(
ids
)
>
0
{
slog
.
Log
(
context
.
TODO
(),
logutil
.
LevelTrace
,
"encoded"
,
"string"
,
s
,
"ids"
,
ids
)
if
bpe
.
vocab
.
AddBOS
{
if
ids
[
0
]
==
bpe
.
vocab
.
BOS
{
slog
.
Warn
(
"adding bos token to prompt which already has it"
,
"id"
,
bpe
.
vocab
.
BOS
)
}
slog
.
Debug
(
"adding bos token to prompt"
,
"id"
,
bpe
.
vocab
.
BOS
)
ids
=
append
([]
int32
{
bpe
.
vocab
.
BOS
},
ids
...
)
}
if
bpe
.
vocab
.
AddEOS
{
if
addSpecial
&&
len
(
ids
)
>
0
{
if
ids
[
len
(
ids
)
-
1
]
==
bpe
.
vocab
.
EOS
{
ids
=
bpe
.
vocab
.
addSpecials
(
ids
)
slog
.
Warn
(
"adding eos token to prompt which already has it"
,
"id"
,
bpe
.
vocab
.
EOS
)
}
slog
.
Debug
(
"adding eos token to prompt"
,
"id"
,
bpe
.
vocab
.
EOS
)
ids
=
append
(
ids
,
bpe
.
vocab
.
EOS
)
}
}
}
return
ids
,
nil
return
ids
,
nil
}
}
type
lazyIdsString
struct
{
ids
[]
int32
}
func
(
l
lazyIdsString
)
LogValue
()
slog
.
Value
{
return
slog
.
AnyValue
(
fmt
.
Sprint
(
l
.
ids
))
}
func
(
bpe
BytePairEncoding
)
Decode
(
ids
[]
int32
)
(
string
,
error
)
{
func
(
bpe
BytePairEncoding
)
Decode
(
ids
[]
int32
)
(
string
,
error
)
{
var
sb
strings
.
Builder
var
sb
strings
.
Builder
for
_
,
id
:=
range
ids
{
for
_
,
id
:=
range
ids
{
...
@@ -349,5 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
...
@@ -349,5 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
}
}
}
}
slog
.
Log
(
context
.
TODO
(),
logutil
.
LevelTrace
,
"decoded"
,
"string"
,
sb
.
String
(),
"from"
,
lazyIdsString
{
ids
:
ids
})
return
sb
.
String
(),
nil
return
sb
.
String
(),
nil
}
}
model/
process_text
_test.go
→
model/
bytepairencoding
_test.go
View file @
b2b270ad
File moved
model/input/input.go
View file @
b2b270ad
...
@@ -2,16 +2,30 @@ package input
...
@@ -2,16 +2,30 @@ package input
import
"github.com/ollama/ollama/ml"
import
"github.com/ollama/ollama/ml"
// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type
Multimodal
struct
{
// Tensor is the embedding data. Implementations may chose what to
// store here or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor
ml
.
Tensor
// Data is implementation-specific opaque data, such as metadata on how
// to layout Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data
any
}
// Input represents one token in the input stream
// Input represents one token in the input stream
type
Input
struct
{
type
Input
struct
{
// Token is a single element of text.
// Token is a single element of text.
Token
int32
Token
int32
// Multimodal is opaque data representing a non-text
// Multimodal is represents a non-text element such as an
// element such as an image (or part of one if the image
// image (or part of one if the image can be processed in pieces).
// can be processed in pieces). It may be either together
// It may be used either together with Token or on its own.
// with Token or on its own.
Multimodal
[]
Multimodal
Multimodal
any
// MultimodalHash is a unique representation of the data
// MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing
// stored in Multimodal, used for caching and comparing
...
@@ -32,7 +46,7 @@ type Input struct {
...
@@ -32,7 +46,7 @@ type Input struct {
// Positions slice.
// Positions slice.
type
MultimodalIndex
struct
{
type
MultimodalIndex
struct
{
Index
int
Index
int
Multimodal
any
Multimodal
[]
Multimodal
}
}
// Batch contains the inputs for a model forward pass
// Batch contains the inputs for a model forward pass
...
...
model/model.go
View file @
b2b270ad
...
@@ -19,6 +19,7 @@ import (
...
@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
fsggml
"github.com/ollama/ollama/fs/ggml"
fsggml
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
_
"github.com/ollama/ollama/ml/backend"
_
"github.com/ollama/ollama/ml/backend"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
...
@@ -39,12 +40,13 @@ type MultimodalProcessor interface {
...
@@ -39,12 +40,13 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and
// EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model.
// generates an output (typically an embedding) that can be used by the model.
//
//
// The return value is most typically an ml.Tensor, however, different
// The return value is one or more tensors, each with optional model-specific
// type are possible, such as an object containing a tensor plus
// opaque metadata. Typically, the tensors might be views into an embedding
// additional metadata, a slice of tensors or even just the original input.
// with each view representing a chunk of data that can be processed independently
// in different batches.
//
//
// The result may be cached by the runner.
// The result may be cached by the runner.
EncodeMultimodal
(
ml
.
Context
,
[]
byte
)
(
any
,
error
)
EncodeMultimodal
(
ml
.
Context
,
[]
byte
)
(
[]
input
.
Multimodal
,
error
)
// PostTokenize is called after tokenization to allow the model to edit the
// PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements.
// input stream to correctly arrange multimodal elements.
...
@@ -96,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
...
@@ -96,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func
New
(
ctx
context
.
Context
,
modelPath
string
,
params
ml
.
BackendParams
)
(
Model
,
error
)
{
func
New
(
modelPath
string
,
params
ml
.
BackendParams
)
(
Model
,
error
)
{
r
,
err
:=
os
.
Open
(
modelPath
)
b
,
err
:=
ml
.
NewBackend
(
modelPath
,
params
)
if
err
!=
nil
{
return
nil
,
err
}
defer
r
.
Close
()
b
,
err
:=
ml
.
NewBackend
(
ctx
,
r
,
params
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
}
...
@@ -132,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
...
@@ -132,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
return
nil
,
err
return
nil
,
err
}
}
defer
r
.
Close
()
defer
r
.
Close
()
meta
,
_
,
err
:=
fsggml
.
Decode
(
r
,
-
1
)
meta
,
err
:=
fsggml
.
Decode
(
r
,
-
1
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
}
...
@@ -202,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
...
@@ -202,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
names
:=
fn
(
tagsCopy
)
names
:=
fn
(
tagsCopy
)
for
_
,
name
:=
range
names
{
for
_
,
name
:=
range
names
{
if
tensor
:=
base
.
Backend
()
.
Get
(
strings
.
Join
(
name
,
"."
));
tensor
!=
nil
{
if
tensor
:=
base
.
Backend
()
.
Get
(
strings
.
Join
(
name
,
"."
));
tensor
!=
nil
{
slog
.
Debug
(
"found tensor"
,
""
,
tensor
)
slog
.
Log
(
context
.
TODO
(),
logutil
.
LevelTrace
,
"found tensor"
,
""
,
tensor
)
vv
.
Set
(
reflect
.
ValueOf
(
tensor
))
vv
.
Set
(
reflect
.
ValueOf
(
tensor
))
break
break
}
}
...
@@ -291,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
...
@@ -291,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
return
nil
,
errors
.
New
(
"batch size cannot be less than 1"
)
return
nil
,
errors
.
New
(
"batch size cannot be less than 1"
)
}
}
var
err
error
batch
.
Inputs
=
ctx
.
Input
()
.
FromIntSlice
(
inputs
,
len
(
inputs
))
batch
.
Inputs
,
err
=
ctx
.
Input
()
.
FromIntSlice
(
inputs
,
len
(
inputs
))
if
err
!=
nil
{
return
nil
,
err
}
cache
:=
m
.
Config
()
.
Cache
cache
:=
m
.
Config
()
.
Cache
if
cache
!=
nil
{
if
cache
!=
nil
{
...
...
model/models/gemma2/model.go
View file @
b2b270ad
...
@@ -7,6 +7,8 @@ import (
...
@@ -7,6 +7,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
)
)
...
@@ -43,8 +45,13 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -43,8 +45,13 @@ func New(c fs.Config) (model.Model, error) {
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Scores
:
c
.
Floats
(
"tokenizer.ggml.scores"
),
Scores
:
c
.
Floats
(
"tokenizer.ggml.scores"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
EOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
)),
BOS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
))},
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOS
:
append
(
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
))},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
},
),
),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
...
@@ -78,11 +85,10 @@ type SelfAttention struct {
...
@@ -78,11 +85,10 @@ type SelfAttention struct {
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
batchSize
:=
hiddenState
.
Dim
(
1
)
batchSize
:=
hiddenState
.
Dim
(
1
)
ropeType
:=
uint32
(
2
)
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
=
q
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
RoPE
(
ctx
,
positionIDs
,
nil
,
uint32
(
opts
.
attnKeyLen
),
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
q
=
fast
.
RoPE
(
ctx
,
q
,
positionIDs
,
opts
.
attnKeyLen
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithTypeNeoX
()
)
if
opts
.
largeModelScaling
{
if
opts
.
largeModelScaling
{
q
=
q
.
Scale
(
ctx
,
1.0
/
math
.
Sqrt
(
float64
(
opts
.
hiddenSize
/
opts
.
numHeads
)))
q
=
q
.
Scale
(
ctx
,
1.0
/
math
.
Sqrt
(
float64
(
opts
.
hiddenSize
/
opts
.
numHeads
)))
...
@@ -92,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
...
@@ -92,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
=
k
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
RoPE
(
ctx
,
positionIDs
,
nil
,
uint32
(
opts
.
attnKeyLen
),
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
k
=
fast
.
RoPE
(
ctx
,
k
,
positionIDs
,
opts
.
attnKeyLen
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithTypeNeoX
()
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
=
v
.
Reshape
(
ctx
,
opts
.
attnValLen
,
opts
.
numKVHeads
,
batchSize
)
v
=
v
.
Reshape
(
ctx
,
opts
.
attnValLen
,
opts
.
numKVHeads
,
batchSize
)
...
@@ -122,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
...
@@ -122,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
}
func
(
m
*
Model
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
return
key
.
RoPE
(
ctx
,
shift
,
nil
,
uint32
(
m
.
Options
.
attnKeyLen
),
uint32
(
2
)
,
m
.
Options
.
ropeBase
,
m
.
Options
.
ropeScale
),
nil
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
m
.
Options
.
attnKeyLen
,
m
.
Options
.
ropeBase
,
m
.
Options
.
ropeScale
,
rope
.
WithTypeNeoX
()
),
nil
}
}
type
MLP
struct
{
type
MLP
struct
{
...
@@ -169,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
...
@@ -169,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
outputs
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
return
nil
,
err
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
if
err
!=
nil
{
return
nil
,
err
}
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
batch
.
Inputs
)
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
batch
.
Inputs
)
hiddenState
=
hiddenState
.
Scale
(
ctx
,
math
.
Sqrt
(
float64
(
m
.
Options
.
hiddenSize
)))
hiddenState
=
hiddenState
.
Scale
(
ctx
,
math
.
Sqrt
(
float64
(
m
.
Options
.
hiddenSize
)))
...
...
model/models/gemma3/model.go
View file @
b2b270ad
...
@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Scores
:
c
.
Floats
(
"tokenizer.ggml.scores"
),
Scores
:
c
.
Floats
(
"tokenizer.ggml.scores"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
E
OS
:
int32
(
1
)
,
B
OS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
))}
,
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOT
:
int32
(
106
),
EOS
:
append
(
AddEOT
:
c
.
Bool
(
"tokenizer.ggml.add_eot_token"
,
false
),
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
)),
int32
(
c
.
Uint
(
"tokenizer.ggml.eot_token_id"
,
106
)),
},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
},
),
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
...
@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
return
&
m
,
nil
return
&
m
,
nil
}
}
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
any
,
error
)
{
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
[]
input
.
Multimodal
,
error
)
{
if
len
(
m
.
VisionModel
.
Layers
)
==
0
{
if
len
(
m
.
VisionModel
.
Layers
)
==
0
{
return
nil
,
model
.
ErrNoVisionModel
return
nil
,
model
.
ErrNoVisionModel
}
}
...
@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
...
@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return
nil
,
err
return
nil
,
err
}
}
pixelValues
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
pixelValues
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
m
.
ImageProcessor
.
imageSize
,
m
.
ImageProcessor
.
imageSize
,
m
.
ImageProcessor
.
imageSize
,
m
.
ImageProcessor
.
imageSize
,
m
.
ImageProcessor
.
numChannels
,
m
.
ImageProcessor
.
numChannels
,
)
)
if
err
!=
nil
{
return
nil
,
err
}
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
=
m
.
MultiModalProjector
.
Forward
(
ctx
,
visionOutputs
,
m
.
imageSize
,
m
.
patchSize
,
m
.
VisionModel
.
eps
)
visionOutputs
=
m
.
MultiModalProjector
.
Forward
(
ctx
,
visionOutputs
,
m
.
imageSize
,
m
.
patchSize
,
m
.
VisionModel
.
eps
)
return
visionOutputs
,
nil
return
[]
input
.
Multimodal
{{
Tensor
:
visionOutputs
}}
,
nil
}
}
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
var
result
[]
input
.
Input
var
result
[]
input
.
Input
for
_
,
inp
:=
range
inputs
{
for
_
,
inp
:=
range
inputs
{
if
inp
.
Multimodal
==
nil
{
if
len
(
inp
.
Multimodal
)
==
0
{
result
=
append
(
result
,
inp
)
result
=
append
(
result
,
inp
)
}
else
{
}
else
{
inputMultimodal
:=
inp
.
Multimodal
.
(
ml
.
Tensor
)
inputMultimodal
:=
inp
.
Multimodal
[
0
]
.
Tensor
result
=
append
(
result
,
result
=
append
(
result
,
input
.
Input
{
Token
:
108
,
SameBatch
:
inputMultimodal
.
Dim
(
1
)
+
3
},
// "\n\n"
input
.
Input
{
Token
:
108
,
SameBatch
:
inputMultimodal
.
Dim
(
1
)
+
3
},
// "\n\n"
input
.
Input
{
Token
:
255999
},
// "<start_of_image>""
input
.
Input
{
Token
:
255999
},
// "<start_of_image>""
input
.
Input
{
Multimodal
:
inputMultimodal
,
MultimodalHash
:
inp
.
MultimodalHash
},
// image data is on the first placeholder
input
.
Input
{
Multimodal
:
[]
input
.
Multimodal
{{
Tensor
:
inputMultimodal
}}
,
MultimodalHash
:
inp
.
MultimodalHash
},
// image data is on the first placeholder
)
)
// add image token placeholders
// add image token placeholders
...
@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
...
@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
outputs
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
return
nil
,
err
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
if
err
!=
nil
{
return
nil
,
err
}
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
}
}
...
...
model/models/gemma3/model_text.go
View file @
b2b270ad
...
@@ -7,7 +7,8 @@ import (
...
@@ -7,7 +7,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
)
)
...
@@ -20,9 +21,6 @@ type TextConfig struct {
...
@@ -20,9 +21,6 @@ type TextConfig struct {
}
}
type
TextModel
struct
{
type
TextModel
struct
{
model
.
Base
model
.
SentencePieceModel
TokenEmbedding
*
nn
.
Embedding
`gguf:"token_embd"`
TokenEmbedding
*
nn
.
Embedding
`gguf:"token_embd"`
Layers
[]
TextLayer
`gguf:"blk"`
Layers
[]
TextLayer
`gguf:"blk"`
OutputNorm
*
nn
.
RMSNorm
`gguf:"output_norm"`
OutputNorm
*
nn
.
RMSNorm
`gguf:"output_norm"`
...
@@ -45,15 +43,6 @@ func newTextModel(c fs.Config) *TextModel {
...
@@ -45,15 +43,6 @@ func newTextModel(c fs.Config) *TextModel {
numBlocks
:=
int
(
c
.
Uint
(
"block_count"
))
numBlocks
:=
int
(
c
.
Uint
(
"block_count"
))
m
:=
TextModel
{
m
:=
TextModel
{
SentencePieceModel
:
model
.
NewSentencePieceModel
(
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Scores
:
c
.
Floats
(
"tokenizer.ggml.scores"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
EOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
)),
},
),
Layers
:
make
([]
TextLayer
,
numBlocks
),
Layers
:
make
([]
TextLayer
,
numBlocks
),
TextConfig
:
&
TextConfig
{
TextConfig
:
&
TextConfig
{
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
...
@@ -86,7 +75,6 @@ type TextSelfAttention struct {
...
@@ -86,7 +75,6 @@ type TextSelfAttention struct {
func
(
sa
*
TextSelfAttention
)
Forward
(
ctx
ml
.
Context
,
layer
int
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
TextConfig
)
ml
.
Tensor
{
func
(
sa
*
TextSelfAttention
)
Forward
(
ctx
ml
.
Context
,
layer
int
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
TextConfig
)
ml
.
Tensor
{
batchSize
:=
hiddenState
.
Dim
(
1
)
batchSize
:=
hiddenState
.
Dim
(
1
)
ropeType
:=
uint32
(
2
)
ropeBase
:=
opts
.
ropeLocalBase
ropeBase
:=
opts
.
ropeLocalBase
if
(
layer
+
1
)
%
gemmaGlobalCacheCount
==
0
{
if
(
layer
+
1
)
%
gemmaGlobalCacheCount
==
0
{
...
@@ -96,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
...
@@ -96,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
=
q
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numHeads
,
batchSize
)
q
=
sa
.
QueryNorm
.
Forward
(
ctx
,
q
,
opts
.
eps
)
q
=
sa
.
QueryNorm
.
Forward
(
ctx
,
q
,
opts
.
eps
)
q
=
q
.
RoPE
(
ctx
,
positionIDs
,
nil
,
uint32
(
opts
.
attnKeyLen
),
ropeType
,
ropeBase
,
opts
.
ropeScale
)
q
=
fast
.
RoPE
(
ctx
,
q
,
positionIDs
,
opts
.
attnKeyLen
,
ropeBase
,
opts
.
ropeScale
,
rope
.
WithTypeNeoX
()
)
if
opts
.
largeModelScaling
{
if
opts
.
largeModelScaling
{
q
=
q
.
Scale
(
ctx
,
1.0
/
math
.
Sqrt
(
float64
(
opts
.
hiddenSize
/
opts
.
numHeads
)))
q
=
q
.
Scale
(
ctx
,
1.0
/
math
.
Sqrt
(
float64
(
opts
.
hiddenSize
/
opts
.
numHeads
)))
...
@@ -107,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
...
@@ -107,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
=
k
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
Reshape
(
ctx
,
opts
.
attnKeyLen
,
opts
.
numKVHeads
,
batchSize
)
k
=
sa
.
KeyNorm
.
Forward
(
ctx
,
k
,
opts
.
eps
)
k
=
sa
.
KeyNorm
.
Forward
(
ctx
,
k
,
opts
.
eps
)
k
=
k
.
RoPE
(
ctx
,
positionIDs
,
nil
,
uint32
(
opts
.
attnKeyLen
),
ropeType
,
ropeBase
,
opts
.
ropeScale
)
k
=
fast
.
RoPE
(
ctx
,
k
,
positionIDs
,
opts
.
attnKeyLen
,
ropeBase
,
opts
.
ropeScale
,
rope
.
WithTypeNeoX
()
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
=
v
.
Reshape
(
ctx
,
opts
.
attnValLen
,
opts
.
numKVHeads
,
batchSize
)
v
=
v
.
Reshape
(
ctx
,
opts
.
attnValLen
,
opts
.
numKVHeads
,
batchSize
)
...
@@ -125,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
...
@@ -125,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
ropeBase
=
m
.
TextConfig
.
ropeGlobalBase
ropeBase
=
m
.
TextConfig
.
ropeGlobalBase
}
}
return
key
.
RoPE
(
ctx
,
shift
,
nil
,
uint32
(
m
.
TextConfig
.
attnKeyLen
),
uint32
(
2
)
,
ropeBase
,
m
.
TextConfig
.
ropeScale
),
nil
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
m
.
TextConfig
.
attnKeyLen
,
ropeBase
,
m
.
TextConfig
.
ropeScale
,
rope
.
WithTypeNeoX
()
),
nil
}
}
type
TextMLP
struct
{
type
TextMLP
struct
{
...
@@ -178,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -178,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// set image embeddings
// set image embeddings
var
except
[]
int
var
except
[]
int
for
_
,
image
:=
range
batch
.
Multimodal
{
for
_
,
image
:=
range
batch
.
Multimodal
{
visionOutputs
:=
image
.
Multimodal
.
(
ml
.
Tensor
)
visionOutputs
:=
image
.
Multimodal
[
0
]
.
Tensor
ctx
.
Forward
(
visionOutputs
.
Copy
(
ctx
,
hiddenState
.
View
(
ctx
,
image
.
Index
*
hiddenState
.
Stride
(
1
),
visionOutputs
.
Dim
(
0
)
*
visionOutputs
.
Dim
(
1
))))
ctx
.
Forward
(
visionOutputs
.
Copy
(
ctx
,
hiddenState
.
View
(
ctx
,
image
.
Index
*
hiddenState
.
Stride
(
1
),
visionOutputs
.
Dim
(
0
)
*
visionOutputs
.
Dim
(
1
))))
for
i
:=
range
visionOutputs
.
Dim
(
1
)
{
for
i
:=
range
visionOutputs
.
Dim
(
1
)
{
...
...
model/models/llama/model.go
View file @
b2b270ad
package
llama
package
llama
import
(
import
(
"
fmt
"
"
cmp
"
"math"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
)
)
type
Options
struct
{
type
Options
struct
{
hiddenSize
,
numHeads
,
numKVHeads
int
hiddenSize
,
numHeads
,
numKVHeads
int
headDim
,
ropeDim
int
eps
,
ropeBase
,
ropeScale
float32
eps
,
ropeBase
,
ropeScale
float32
ropeDim
uint32
}
}
type
Model
struct
{
type
Model
struct
{
...
@@ -32,10 +33,6 @@ type Model struct {
...
@@ -32,10 +33,6 @@ type Model struct {
}
}
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
if
!
strings
.
EqualFold
(
c
.
String
(
"tokenizer.ggml.model"
),
"gpt2"
)
{
return
nil
,
fmt
.
Errorf
(
"tokenizer %s not yet supported"
,
c
.
String
(
"tokenizer.ggml.model"
))
}
m
:=
Model
{
m
:=
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
...
@@ -43,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -43,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
E
OS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.
e
os_token_id"
)),
B
OS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.
b
os_token_id"
))
}
,
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOS
:
append
(
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
))},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
},
),
),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
...
@@ -54,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -54,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
headDim
:
int
(
c
.
Uint
(
"attention.key_length"
)),
ropeDim
:
int
(
c
.
Uint
(
"rope.dimension_count"
)),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeDim
:
c
.
Uint
(
"rope.dimension_count"
),
},
},
}
}
...
@@ -74,31 +75,31 @@ type SelfAttention struct {
...
@@ -74,31 +75,31 @@ type SelfAttention struct {
RopeFactors
ml
.
Tensor
`gguf:"rope_freqs.weight"`
RopeFactors
ml
.
Tensor
`gguf:"rope_freqs.weight"`
}
}
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
position
ID
s
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
batchSize
:=
hiddenState
.
Dim
(
1
)
batchSize
:=
hiddenState
.
Dim
(
1
)
headDim
:=
opts
.
hiddenSize
/
opts
.
numHeads
headDim
:=
cmp
.
Or
(
opts
.
headDim
,
opts
.
hiddenSize
/
opts
.
numHeads
)
rope
Type
:=
uint32
(
0
)
rope
Dim
:=
cmp
.
Or
(
opts
.
ropeDim
,
headDim
)
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
query
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
=
q
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
query
=
query
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
RoPE
(
ctx
,
positionIDs
,
sa
.
RopeFactors
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
key
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
=
k
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
key
=
key
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
RoPE
(
ctx
,
positionIDs
,
sa
.
RopeFactors
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
alue
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
=
v
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
v
alue
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
scaleFactor
:=
1.0
/
math
.
Sqrt
(
float64
(
headDim
))
query
=
fast
.
RoPE
(
ctx
,
query
,
positions
,
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
))
kqv
:=
nn
.
Attention
(
ctx
,
q
,
k
,
v
,
scaleFactor
,
cache
)
key
=
fast
.
RoPE
(
ctx
,
key
,
positions
,
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
))
kqv
=
kqv
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
batchSize
)
return
sa
.
Output
.
Forward
(
ctx
,
kqv
)
attention
:=
nn
.
Attention
(
ctx
,
query
,
key
,
value
,
1.0
/
math
.
Sqrt
(
float64
(
headDim
)),
cache
)
attention
=
attention
.
Reshape
(
ctx
,
headDim
*
opts
.
numHeads
,
batchSize
)
return
sa
.
Output
.
Forward
(
ctx
,
attention
)
}
}
func
(
m
*
Model
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
return
key
.
RoPE
(
ctx
,
shift
,
m
.
Layers
[
layer
]
.
SelfAttention
.
RopeFactors
,
uint32
(
0
),
m
.
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
),
nil
ropeDim
:=
cmp
.
Or
(
m
.
ropeDim
,
m
.
hiddenSize
/
m
.
numHeads
)
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
,
rope
.
WithFactors
(
m
.
Layers
[
layer
]
.
SelfAttention
.
RopeFactors
)),
nil
}
}
type
MLP
struct
{
type
MLP
struct
{
...
@@ -119,11 +120,11 @@ type Layer struct {
...
@@ -119,11 +120,11 @@ type Layer struct {
MLP
*
MLP
MLP
*
MLP
}
}
func
(
l
*
Layer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
position
ID
s
,
outputs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
func
(
l
*
Layer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
,
outputs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
Options
)
ml
.
Tensor
{
residual
:=
hiddenState
residual
:=
hiddenState
hiddenState
=
l
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
l
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
l
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
position
ID
s
,
cache
,
opts
)
hiddenState
=
l
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
positions
,
cache
,
opts
)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
// we need logits for.
...
@@ -141,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
...
@@ -141,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
return
nil
,
err
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
if
err
!=
nil
{
return
nil
,
err
}
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
batch
.
Inputs
)
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
batch
.
Inputs
)
for
i
,
layer
:=
range
m
.
Layers
{
for
i
,
layer
:=
range
m
.
Layers
{
m
.
Cache
.
SetLayer
(
i
)
m
.
Cache
.
SetLayer
(
i
)
var
lastLayerO
utputs
ml
.
Tensor
var
o
utputs
ml
.
Tensor
if
i
==
len
(
m
.
Layers
)
-
1
{
if
i
==
len
(
m
.
Layers
)
-
1
{
lastLayerOutputs
=
o
utputs
outputs
=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
O
utputs
))
}
}
hiddenState
=
layer
.
Forward
(
ctx
,
hiddenState
,
positions
,
lastLayerO
utputs
,
m
.
Cache
,
m
.
Options
)
hiddenState
=
layer
.
Forward
(
ctx
,
hiddenState
,
positions
,
o
utputs
,
m
.
Cache
,
m
.
Options
)
}
}
hiddenState
=
m
.
OutputNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
hiddenState
=
m
.
OutputNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
...
...
model/models/llama4/model.go
View file @
b2b270ad
...
@@ -4,7 +4,6 @@ import (
...
@@ -4,7 +4,6 @@ import (
"bytes"
"bytes"
"image"
"image"
"slices"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
...
@@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
E
OS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.
e
os_token_id"
)),
B
OS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.
b
os_token_id"
))
}
,
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOS
:
append
(
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
))},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
},
),
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
...
@@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
return
&
m
,
nil
return
&
m
,
nil
}
}
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
any
,
error
)
{
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
[]
input
.
Multimodal
,
error
)
{
if
len
(
m
.
VisionModel
.
Layers
)
<
1
{
if
len
(
m
.
VisionModel
.
Layers
)
<
1
{
return
nil
,
model
.
ErrNoVisionModel
return
nil
,
model
.
ErrNoVisionModel
}
}
...
@@ -75,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
...
@@ -75,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return
nil
,
err
return
nil
,
err
}
}
tilesLocal
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
pixelsLocal
,
size
.
X
,
size
.
Y
,
m
.
numChannels
)
tilesLocal
:=
ctx
.
Input
()
.
FromFloatSlice
(
pixelsLocal
,
size
.
X
,
size
.
Y
,
m
.
numChannels
)
if
err
!=
nil
{
return
nil
,
err
}
ratioW
,
ratioH
:=
size
.
X
/
m
.
imageSize
,
size
.
Y
/
m
.
imageSize
ratioW
,
ratioH
:=
size
.
X
/
m
.
imageSize
,
size
.
Y
/
m
.
imageSize
...
@@ -89,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
...
@@ -89,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
pixelValues
:=
tilesLocal
pixelValues
:=
tilesLocal
if
len
(
pixelsGlobal
)
>
0
{
if
len
(
pixelsGlobal
)
>
0
{
tilesGlobal
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
pixelsGlobal
,
m
.
imageSize
,
m
.
imageSize
,
m
.
numChannels
)
tilesGlobal
:=
ctx
.
Input
()
.
FromFloatSlice
(
pixelsGlobal
,
m
.
imageSize
,
m
.
imageSize
,
m
.
numChannels
)
if
err
!=
nil
{
return
nil
,
err
}
pixelValues
=
pixelValues
.
Concat
(
ctx
,
tilesGlobal
,
3
)
pixelValues
=
pixelValues
.
Concat
(
ctx
,
tilesGlobal
,
3
)
}
}
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
=
visionOutputs
.
Reshape
(
ctx
,
visionOutputs
.
Dim
(
0
),
visionOutputs
.
Dim
(
1
)
*
visionOutputs
.
Dim
(
2
)
*
visionOutputs
.
Dim
(
3
))
visionOutputs
=
visionOutputs
.
Reshape
(
ctx
,
visionOutputs
.
Dim
(
0
),
visionOutputs
.
Dim
(
1
)
*
visionOutputs
.
Dim
(
2
)
*
visionOutputs
.
Dim
(
3
))
projectedOutputs
:=
m
.
Projector
.
Forward
(
ctx
,
visionOutputs
)
projectedOutputs
:=
m
.
Projector
.
Forward
(
ctx
,
visionOutputs
)
return
&
chunks
{
Model
:
m
,
Tensor
:
projectedOutputs
,
aspectRatio
:
image
.
Point
{
ratioW
,
ratioH
}},
nil
}
type
chunks
struct
{
var
multimodal
[]
input
.
Multimodal
*
Model
aspectRatio
:=
image
.
Point
{
ratioW
,
ratioH
}
ml
.
Tensor
aspectRatio
image
.
Point
var
offset
int
patchesPerChunk
:=
projectedOutputs
.
Dim
(
1
)
if
aspectRatio
.
Y
*
aspectRatio
.
X
>
1
{
patchesPerChunk
=
projectedOutputs
.
Dim
(
1
)
/
(
aspectRatio
.
X
*
aspectRatio
.
Y
+
1
)
for
range
aspectRatio
.
Y
{
for
x
:=
range
aspectRatio
.
X
{
view
:=
projectedOutputs
.
View
(
ctx
,
projectedOutputs
.
Stride
(
1
)
*
offset
,
projectedOutputs
.
Dim
(
0
),
projectedOutputs
.
Stride
(
1
),
patchesPerChunk
)
var
separator
separator
if
x
<
aspectRatio
.
X
-
1
{
separator
.
x
=
true
// <|tile_x_separator|>
}
else
{
separator
.
y
=
true
// <|tile_y_separator|>
}
multimodal
=
append
(
multimodal
,
input
.
Multimodal
{
Tensor
:
view
,
Data
:
&
separator
})
offset
+=
patchesPerChunk
}
}
}
dataOnce
sync
.
Once
view
:=
projectedOutputs
.
View
(
ctx
,
projectedOutputs
.
Stride
(
1
)
*
offset
,
data
[]
float32
projectedOutputs
.
Dim
(
0
),
projectedOutputs
.
Stride
(
1
),
}
patchesPerChunk
)
multimodal
=
append
(
multimodal
,
input
.
Multimodal
{
Tensor
:
view
,
Data
:
&
separator
{}})
type
chunk
struct
{
return
multimodal
,
nil
*
chunks
s
,
n
int
}
}
func
(
r
*
chunk
)
floats
()
[]
float32
{
type
separator
struct
{
r
.
dataOnce
.
Do
(
func
()
{
x
bool
temp
:=
r
.
Backend
()
.
NewContext
()
y
bool
defer
temp
.
Close
()
temp
.
Forward
(
r
.
Tensor
)
.
Compute
(
r
.
Tensor
)
r
.
data
=
r
.
Floats
()
})
return
r
.
data
[
r
.
s
*
r
.
Dim
(
0
)
:
(
r
.
s
+
r
.
n
)
*
r
.
Dim
(
0
)]
}
}
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
var
result
[]
input
.
Input
var
result
[]
input
.
Input
for
_
,
inp
:=
range
inputs
{
for
_
,
inp
:=
range
inputs
{
if
inp
.
Multimodal
==
nil
{
if
len
(
inp
.
Multimodal
)
==
0
{
result
=
append
(
result
,
inp
)
result
=
append
(
result
,
inp
)
continue
continue
}
}
t
:=
inp
.
Multimodal
.
(
*
chunks
)
var
imageInputs
[]
input
.
Input
var
imageInputs
[]
input
.
Input
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_start|>
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_start|>
var
offset
int
for
i
,
mm
:=
range
inp
.
Multimodal
{
patchesPerChunk
:=
t
.
Dim
(
1
)
patchesPerChunk
:=
mm
.
Tensor
.
Dim
(
1
)
if
t
.
aspectRatio
.
Y
*
t
.
aspectRatio
.
X
>
1
{
patchesPerChunk
=
t
.
Dim
(
1
)
/
(
t
.
aspectRatio
.
X
*
t
.
aspectRatio
.
Y
+
1
)
for
range
t
.
aspectRatio
.
Y
{
for
x
:=
range
t
.
aspectRatio
.
X
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
&
chunk
{
t
,
offset
,
patchesPerChunk
},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
if
x
<
t
.
aspectRatio
.
X
-
1
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200084
})
// <|tile_x_separator|>
}
offset
+=
patchesPerChunk
}
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200085
})
// <|tile_y_separator|>
if
i
<
len
(
inp
.
Multimodal
)
-
1
{
separator
:=
mm
.
Data
.
(
*
separator
)
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
[]
input
.
Multimodal
{{
Tensor
:
mm
.
Tensor
}},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
if
separator
.
x
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200084
})
// <|tile_x_separator|>
}
if
separator
.
y
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200085
})
// <|tile_y_separator|>
}
}
else
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200090
})
// <|image|>
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
[]
input
.
Multimodal
{{
Tensor
:
mm
.
Tensor
}},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_end|>
}
}
}
}
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200090
})
// <|image|>
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
&
chunk
{
t
,
offset
,
patchesPerChunk
},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_end|>
result
=
append
(
result
,
imageInputs
...
)
result
=
append
(
result
,
imageInputs
...
)
}
}
...
@@ -171,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
...
@@ -171,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
outputs
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
return
nil
,
err
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
if
err
!=
nil
{
return
nil
,
err
}
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
}
}
...
...
model/models/llama4/model_text.go
View file @
b2b270ad
...
@@ -8,6 +8,8 @@ import (
...
@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
)
)
...
@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
...
@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
if
useRope
{
if
useRope
{
query
=
query
.
RoPE
(
ctx
,
positions
,
sa
.
RopeFactors
,
uint32
(
opts
.
ropeDim
),
uint32
(
0
),
opts
.
ropeBase
,
opts
.
ropeScale
)
query
=
fast
.
RoPE
(
ctx
,
query
,
positions
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
)
)
key
=
key
.
RoPE
(
ctx
,
positions
,
sa
.
RopeFactors
,
uint32
(
opts
.
ropeDim
),
uint32
(
0
),
opts
.
ropeBase
,
opts
.
ropeScale
)
key
=
fast
.
RoPE
(
ctx
,
key
,
positions
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
)
)
}
}
if
opts
.
useQKNorm
{
if
opts
.
useQKNorm
{
...
@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
...
@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
}
}
type
TextExperts
struct
{
type
TextExperts
struct
{
Gate
ml
.
Tenso
r
`gguf:"ffn_gate_exps
.weight
"`
Gate
*
nn
.
Linea
r
`gguf:"ffn_gate_exps"`
Up
ml
.
Tenso
r
`gguf:"ffn_up_exps
.weight
"`
Up
*
nn
.
Linea
r
`gguf:"ffn_up_exps"`
Down
ml
.
Tenso
r
`gguf:"ffn_down_exps
.weight
"`
Down
*
nn
.
Linea
r
`gguf:"ffn_down_exps"`
}
}
func
(
e
*
TextExperts
)
Forward
(
ctx
ml
.
Context
,
hiddenStates
,
routerLogits
ml
.
Tensor
,
opts
*
TextOptions
)
ml
.
Tensor
{
func
(
e
*
TextExperts
)
Forward
(
ctx
ml
.
Context
,
hiddenStates
,
routerLogits
ml
.
Tensor
,
opts
*
TextOptions
)
ml
.
Tensor
{
...
@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
...
@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
hiddenStates
=
hiddenStates
.
Repeat
(
ctx
,
1
,
opts
.
numExpertsUsed
)
hiddenStates
=
hiddenStates
.
Repeat
(
ctx
,
1
,
opts
.
numExpertsUsed
)
hiddenStates
=
hiddenStates
.
Mul
(
ctx
,
scores
)
hiddenStates
=
hiddenStates
.
Mul
(
ctx
,
scores
)
upStates
:=
e
.
Up
.
MulmatID
(
ctx
,
hiddenStates
,
experts
)
upStates
:=
e
.
Up
.
Weight
.
MulmatID
(
ctx
,
hiddenStates
,
experts
)
gateStates
:=
e
.
Gate
.
MulmatID
(
ctx
,
hiddenStates
,
experts
)
gateStates
:=
e
.
Gate
.
Weight
.
MulmatID
(
ctx
,
hiddenStates
,
experts
)
downStates
:=
e
.
Down
.
MulmatID
(
ctx
,
upStates
.
Mul
(
ctx
,
gateStates
.
SILU
(
ctx
)),
experts
)
downStates
:=
e
.
Down
.
Weight
.
MulmatID
(
ctx
,
upStates
.
Mul
(
ctx
,
gateStates
.
SILU
(
ctx
)),
experts
)
nextStates
:=
downStates
.
View
(
ctx
,
0
,
hiddenStates
.
Dim
(
0
),
downStates
.
Stride
(
2
),
hiddenStates
.
Dim
(
2
))
nextStates
:=
downStates
.
View
(
ctx
,
0
,
hiddenStates
.
Dim
(
0
),
downStates
.
Stride
(
2
),
hiddenStates
.
Dim
(
2
))
for
i
:=
1
;
i
<
opts
.
numExpertsUsed
;
i
++
{
for
i
:=
1
;
i
<
opts
.
numExpertsUsed
;
i
++
{
nextStates
.
Add
(
ctx
,
downStates
.
View
(
ctx
,
i
*
downStates
.
Stride
(
1
),
hiddenStates
.
Dim
(
0
),
downStates
.
Stride
(
2
),
hiddenStates
.
Dim
(
2
)))
nextStates
=
nextStates
.
Add
(
ctx
,
downStates
.
View
(
ctx
,
i
*
downStates
.
Stride
(
1
),
hiddenStates
.
Dim
(
0
),
downStates
.
Stride
(
2
),
hiddenStates
.
Dim
(
2
)))
}
}
return
nextStates
return
nextStates
...
@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputs
)
.
Duplicate
(
ctx
)
hiddenStates
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputs
)
.
Duplicate
(
ctx
)
for
_
,
mi
:=
range
batch
.
Multimodal
{
for
_
,
mi
:=
range
batch
.
Multimodal
{
f32s
:=
mi
.
Multimodal
.
(
*
chunk
)
.
floats
()
img
:=
mi
.
Multimodal
[
0
]
.
Tensor
img
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
len
(
f32s
)
/
m
.
hiddenSize
,
m
.
hiddenSize
)
if
err
!=
nil
{
panic
(
err
)
}
ctx
.
Forward
(
img
.
Copy
(
ctx
,
hiddenStates
.
View
(
ctx
,
mi
.
Index
*
hiddenStates
.
Stride
(
1
),
img
.
Dim
(
0
)
*
img
.
Dim
(
1
))))
ctx
.
Forward
(
img
.
Copy
(
ctx
,
hiddenStates
.
View
(
ctx
,
mi
.
Index
*
hiddenStates
.
Stride
(
1
),
img
.
Dim
(
0
)
*
img
.
Dim
(
1
))))
}
}
...
@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
scales
[
i
]
=
float32
(
math
.
Log
(
math
.
Floor
(((
float64
(
p
)
+
1.0
)
/
float64
(
m
.
attentionFloorScale
))
+
1.0
))
*
m
.
attentionScale
+
1.0
)
scales
[
i
]
=
float32
(
math
.
Log
(
math
.
Floor
(((
float64
(
p
)
+
1.0
)
/
float64
(
m
.
attentionFloorScale
))
+
1.0
))
*
m
.
attentionScale
+
1.0
)
}
}
var
err
error
attentionScales
=
ctx
.
Input
()
.
FromFloatSlice
(
scales
,
1
,
1
,
len
(
scales
))
attentionScales
,
err
=
ctx
.
Input
()
.
FromFloatSlice
(
scales
,
1
,
1
,
len
(
scales
))
if
err
!=
nil
{
panic
(
err
)
}
}
}
for
i
,
layer
:=
range
m
.
Layers
{
for
i
,
layer
:=
range
m
.
Layers
{
...
@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
}
}
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
return
key
.
RoPE
(
ctx
,
shift
,
m
.
Layers
[
layer
]
.
Attention
.
RopeFactors
,
uint32
(
0
),
uint32
(
m
.
ropeDim
),
m
.
ropeBase
,
m
.
ropeScale
),
nil
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
m
.
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
,
rope
.
WithFactors
(
m
.
Layers
[
layer
]
.
Attention
.
RopeFactors
)
),
nil
}
}
model/models/llama4/model_vision.go
View file @
b2b270ad
...
@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
...
@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}
}
hiddenStates
=
m
.
LayerNormPost
.
Forward
(
ctx
,
hiddenStates
,
m
.
eps
)
hiddenStates
=
m
.
LayerNormPost
.
Forward
(
ctx
,
hiddenStates
,
m
.
eps
)
hiddenStates
=
hiddenStates
.
Unp
ad
(
ctx
,
0
,
1
,
0
,
0
)
hiddenStates
=
hiddenStates
.
P
ad
(
ctx
,
0
,
-
1
,
0
,
0
)
hiddenStates
=
m
.
VisionAdapter
.
Forward
(
ctx
,
hiddenStates
,
m
.
VisionOptions
)
hiddenStates
=
m
.
VisionAdapter
.
Forward
(
ctx
,
hiddenStates
,
m
.
VisionOptions
)
return
hiddenStates
return
hiddenStates
}
}
...
@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
...
@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
}
}
}
}
ropeFreqs
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
freqs
,
freqDim
/
2
,
numPatches
,
2
)
ropeFreqs
:=
ctx
.
Input
()
.
FromFloatSlice
(
freqs
,
freqDim
/
2
,
numPatches
,
2
)
if
err
!=
nil
{
panic
(
err
)
}
ropeFreqs
=
ropeFreqs
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
ropeFreqs
=
ropeFreqs
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
ropeFreqs
=
ropeFreqs
.
Reshape
(
ctx
,
freqDim
,
1
,
numPatches
)
ropeFreqs
=
ropeFreqs
.
Reshape
(
ctx
,
freqDim
,
1
,
numPatches
)
...
...
model/models/mistral3/model.go
View file @
b2b270ad
...
@@ -4,7 +4,6 @@ import (
...
@@ -4,7 +4,6 @@ import (
"bytes"
"bytes"
"image"
"image"
"slices"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
...
@@ -16,6 +15,8 @@ import (
...
@@ -16,6 +15,8 @@ import (
type
Model
struct
{
type
Model
struct
{
model
.
Base
model
.
Base
model
.
BytePairEncoding
*
TextModel
*
TextModel
*
VisionModel
`gguf:"v,vision"`
*
VisionModel
`gguf:"v,vision"`
*
MultiModalProjector
`gguf:"mm"`
*
MultiModalProjector
`gguf:"mm"`
...
@@ -30,13 +31,23 @@ var _ model.MultimodalProcessor = (*Model)(nil)
...
@@ -30,13 +31,23 @@ var _ model.MultimodalProcessor = (*Model)(nil)
var
_
model
.
TextProcessor
=
(
*
Model
)(
nil
)
var
_
model
.
TextProcessor
=
(
*
Model
)(
nil
)
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
textModel
,
err
:=
NewTextModel
(
c
)
if
err
!=
nil
{
return
nil
,
err
}
m
:=
&
Model
{
m
:=
&
Model
{
TextModel
:
textModel
,
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
BOS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
))},
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOS
:
append
(
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
))},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
),
TextModel
:
newTextModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
VisionModel
:
newVisionModel
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
MultiModalProjector
:
newMultiModalProjector
(
c
),
MultiModalProjector
:
newMultiModalProjector
(
c
),
...
@@ -88,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
...
@@ -88,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
}
}
}
}
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
any
,
error
)
{
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
[]
input
.
Multimodal
,
error
)
{
if
len
(
m
.
VisionModel
.
Layers
)
==
0
{
if
len
(
m
.
VisionModel
.
Layers
)
==
0
{
return
nil
,
model
.
ErrNoVisionModel
return
nil
,
model
.
ErrNoVisionModel
}
}
...
@@ -103,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
...
@@ -103,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return
nil
,
err
return
nil
,
err
}
}
pixelValues
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
size
.
X
,
size
.
Y
,
m
.
ImageProcessor
.
numChannels
)
pixelValues
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
size
.
X
,
size
.
Y
,
m
.
ImageProcessor
.
numChannels
)
if
err
!=
nil
{
return
nil
,
err
}
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
features
,
size
:=
m
.
MultiModalProjector
.
Forward
(
ctx
,
visionOutputs
,
size
)
features
,
size
:=
m
.
MultiModalProjector
.
Forward
(
ctx
,
visionOutputs
,
size
)
// split into patches to be sent to the text transformer
// split into patches to be sent to the text transformer
parent
:=
imageFeatures
{
tensor
:
features
}
rows
:=
make
([]
input
.
Multimodal
,
size
.
Y
)
rows
:=
make
([]
*
imageRow
,
size
.
Y
)
for
i
:=
range
rows
{
for
i
:=
range
rows
{
rows
[
i
]
=
&
imageRow
{
parent
:
&
parent
,
s
:
i
,
shape
:
[]
int
{
features
.
Dim
(
0
),
size
.
X
}}
rows
[
i
]
.
Tensor
=
features
.
View
(
ctx
,
features
.
Stride
(
1
)
*
size
.
X
*
i
,
features
.
Dim
(
0
),
features
.
Stride
(
1
),
size
.
X
)
}
}
return
rows
,
nil
return
rows
,
nil
}
}
type
imageFeatures
struct
{
tensor
ml
.
Tensor
dataOnce
sync
.
Once
data
[]
float32
}
type
imageRow
struct
{
parent
*
imageFeatures
s
int
shape
[]
int
}
func
(
r
*
imageRow
)
data
()
[]
float32
{
n
:=
1
for
_
,
s
:=
range
r
.
shape
{
n
*=
s
}
return
r
.
parent
.
data
[
r
.
s
*
n
:
(
r
.
s
+
1
)
*
n
]
}
// PostTokenize arranges Mistral 3's inputs for the forward pass
// PostTokenize arranges Mistral 3's inputs for the forward pass
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
...
@@ -151,15 +136,14 @@ func (r *imageRow) data() []float32 {
...
@@ -151,15 +136,14 @@ func (r *imageRow) data() []float32 {
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
var
result
[]
input
.
Input
var
result
[]
input
.
Input
for
_
,
inp
:=
range
inputs
{
for
_
,
inp
:=
range
inputs
{
if
inp
.
Multimodal
==
nil
{
if
len
(
inp
.
Multimodal
)
==
0
{
result
=
append
(
result
,
inp
)
result
=
append
(
result
,
inp
)
}
else
{
}
else
{
inputMultimodal
:=
inp
.
Multimodal
.
([]
*
imageRow
)
for
i
,
row
:=
range
inp
.
Multimodal
{
for
i
,
row
:=
range
inputMultimodal
{
// [IMG]
// [IMG]
result
=
append
(
result
,
input
.
Input
{
Token
:
10
,
Multimodal
:
row
,
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
row
.
shape
[
1
]
})
result
=
append
(
result
,
input
.
Input
{
Token
:
10
,
Multimodal
:
[]
input
.
Multimodal
{{
Tensor
:
row
.
Tensor
}}
,
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
row
.
Tensor
.
Dim
(
1
)
})
result
=
append
(
result
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
10
}},
row
.
shape
[
1
]
-
1
)
...
)
result
=
append
(
result
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
10
}},
row
.
Tensor
.
Dim
(
1
)
-
1
)
...
)
if
i
==
len
(
inp
ut
Multimodal
)
-
1
{
if
i
==
len
(
inp
.
Multimodal
)
-
1
{
// [IMG_END]
// [IMG_END]
result
=
append
(
result
,
input
.
Input
{
Token
:
13
})
result
=
append
(
result
,
input
.
Input
{
Token
:
13
})
}
else
{
}
else
{
...
@@ -174,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
...
@@ -174,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
outputs
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
return
nil
,
err
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
if
err
!=
nil
{
return
nil
,
err
}
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
batch
,
m
.
Cache
),
nil
}
}
...
...
model/models/mistral3/model_text.go
View file @
b2b270ad
package
mistral3
package
mistral3
import
(
import
(
"
fmt
"
"
cmp
"
"math"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/m
odel
"
"github.com/ollama/ollama/m
l/nn/fast
"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/input"
)
)
type
TextOptions
struct
{
type
TextOptions
struct
{
hiddenSize
,
numHeads
,
numKVHeads
,
headDim
int
hiddenSize
,
numHeads
,
numKVHeads
int
eps
,
ropeBase
,
ropeScale
float32
headDim
,
ropeDim
int
ropeDim
uin
t32
eps
,
ropeBase
,
ropeScale
floa
t32
}
}
type
TextModel
struct
{
type
TextModel
struct
{
model
.
Base
model
.
BytePairEncoding
TokenEmbedding
*
nn
.
Embedding
`gguf:"token_embd"`
TokenEmbedding
*
nn
.
Embedding
`gguf:"token_embd"`
Layers
[]
Layer
`gguf:"blk"`
Layers
[]
Layer
`gguf:"blk"`
OutputNorm
*
nn
.
RMSNorm
`gguf:"output_norm"`
OutputNorm
*
nn
.
RMSNorm
`gguf:"output_norm"`
...
@@ -40,19 +36,15 @@ type SelfAttention struct {
...
@@ -40,19 +36,15 @@ type SelfAttention struct {
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
TextOptions
)
ml
.
Tensor
{
func
(
sa
*
SelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
ml
.
Tensor
,
cache
kvcache
.
Cache
,
opts
*
TextOptions
)
ml
.
Tensor
{
batchSize
:=
hiddenState
.
Dim
(
1
)
batchSize
:=
hiddenState
.
Dim
(
1
)
ropeType
:=
uint32
(
0
)
headDim
:=
cmp
.
Or
(
opts
.
headDim
,
opts
.
hiddenSize
/
opts
.
numHeads
)
headDim
:=
opts
.
headDim
if
headDim
==
0
{
headDim
=
opts
.
hiddenSize
/
opts
.
numHeads
}
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
q
=
q
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
q
=
q
.
RoPE
(
ctx
,
positionIDs
,
nil
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
q
=
fast
.
RoPE
(
ctx
,
q
,
positionIDs
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
)
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
k
=
k
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
k
=
k
.
RoPE
(
ctx
,
positionIDs
,
nil
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
k
=
fast
.
RoPE
(
ctx
,
k
,
positionIDs
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
v
=
v
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
v
=
v
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
...
@@ -63,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
...
@@ -63,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
}
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
return
key
.
RoPE
(
ctx
,
shift
,
nil
,
uint32
(
0
)
,
m
.
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
),
nil
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
m
.
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
),
nil
}
}
type
MLP
struct
{
type
MLP
struct
{
...
@@ -110,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -110,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// image embeddings
// image embeddings
for
_
,
image
:=
range
batch
.
Multimodal
{
for
_
,
image
:=
range
batch
.
Multimodal
{
row
:=
image
.
Multimodal
.
(
*
imageRow
)
imageFeature
:=
image
.
Multimodal
[
0
]
.
Tensor
row
.
parent
.
dataOnce
.
Do
(
func
()
{
// use a new, throwaway context so the image tensor is not added to the graph
temp
:=
m
.
Backend
()
.
NewContext
()
temp
.
Forward
(
row
.
parent
.
tensor
)
.
Compute
(
row
.
parent
.
tensor
)
row
.
parent
.
data
=
row
.
parent
.
tensor
.
Floats
()
temp
.
Close
()
})
imageFeature
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
row
.
data
(),
row
.
shape
...
)
if
err
!=
nil
{
panic
(
err
)
}
ctx
.
Forward
(
imageFeature
.
Copy
(
ctx
,
hiddenState
.
View
(
ctx
,
image
.
Index
*
hiddenState
.
Stride
(
1
),
imageFeature
.
Dim
(
0
)
*
imageFeature
.
Dim
(
1
))))
ctx
.
Forward
(
imageFeature
.
Copy
(
ctx
,
hiddenState
.
View
(
ctx
,
image
.
Index
*
hiddenState
.
Stride
(
1
),
imageFeature
.
Dim
(
0
)
*
imageFeature
.
Dim
(
1
))))
}
}
...
@@ -142,36 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
...
@@ -142,36 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
return
m
.
Output
.
Forward
(
ctx
,
hiddenState
)
return
m
.
Output
.
Forward
(
ctx
,
hiddenState
)
}
}
func
NewTextModel
(
c
fs
.
Config
)
(
*
TextModel
,
error
)
{
func
newTextModel
(
c
fs
.
Config
)
*
TextModel
{
if
!
strings
.
EqualFold
(
c
.
String
(
"tokenizer.ggml.model"
),
"gpt2"
)
{
return
&
TextModel
{
return
nil
,
fmt
.
Errorf
(
"tokenizer %s not yet supported"
,
c
.
String
(
"tokenizer.ggml.model"
))
}
textModel
:=
&
TextModel
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
&
model
.
Vocabulary
{
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
,
1
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
EOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
,
2
)),
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
},
),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
Layers
:
make
([]
Layer
,
c
.
Uint
(
"block_count"
)),
TextOptions
:
&
TextOptions
{
TextOptions
:
&
TextOptions
{
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
headDim
:
int
(
c
.
Uint
(
"attention.key_length"
)),
headDim
:
int
(
c
.
Uint
(
"attention.key_length"
)),
ropeDim
:
int
(
c
.
Uint
(
"rope.dimension_count"
)),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeDim
:
c
.
Uint
(
"rope.dimension_count"
),
},
},
}
}
return
textModel
,
nil
}
}
model/models/mistral3/model_vision.go
View file @
b2b270ad
...
@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
...
@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
}
}
}
}
h
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
frequenciesHeight
,
maxPatchesPerSide
,
frequencies
/
2
)
h
:=
ctx
.
Input
()
.
FromFloatSlice
(
frequenciesHeight
,
maxPatchesPerSide
,
frequencies
/
2
)
if
err
!=
nil
{
w
:=
ctx
.
Input
()
.
FromFloatSlice
(
frequenciesWidth
,
maxPatchesPerSide
,
frequencies
/
2
)
panic
(
err
)
}
w
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
frequenciesWidth
,
maxPatchesPerSide
,
frequencies
/
2
)
if
err
!=
nil
{
panic
(
err
)
}
h
=
h
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
h
=
h
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
w
=
w
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
w
=
w
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
...
@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
...
@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}
}
}
}
positionIDs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
positions
,
len
(
positions
))
positionIDs
:=
ctx
.
Input
()
.
FromIntSlice
(
positions
,
len
(
positions
))
if
err
!=
nil
{
panic
(
err
)
}
positionEmbedding
:=
m
.
positionalEmbedding
(
ctx
,
positionIDs
)
positionEmbedding
:=
m
.
positionalEmbedding
(
ctx
,
positionIDs
)
cos
,
sin
:=
positionEmbedding
.
Cos
(
ctx
),
positionEmbedding
.
Sin
(
ctx
)
cos
,
sin
:=
positionEmbedding
.
Cos
(
ctx
),
positionEmbedding
.
Sin
(
ctx
)
...
@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
...
@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
func
newVisionModel
(
c
fs
.
Config
)
*
VisionModel
{
func
newVisionModel
(
c
fs
.
Config
)
*
VisionModel
{
return
&
VisionModel
{
return
&
VisionModel
{
Layers
:
make
([]
VisionEncoderLayer
,
c
.
Uint
(
"vision.block_count"
,
24
)),
Layers
:
make
([]
VisionEncoderLayer
,
c
.
Uint
(
"vision.block_count"
)),
VisionModelOptions
:
&
VisionModelOptions
{
VisionModelOptions
:
&
VisionModelOptions
{
hiddenSize
:
int
(
c
.
Uint
(
"vision.embedding_length"
,
1024
)),
hiddenSize
:
int
(
c
.
Uint
(
"vision.embedding_length"
,
1024
)),
numHeads
:
int
(
c
.
Uint
(
"vision.attention.head_count"
,
16
)),
numHeads
:
int
(
c
.
Uint
(
"vision.attention.head_count"
,
16
)),
...
...
model/models/mllama/imageproc.go
deleted
100644 → 0
View file @
20c5fd39
package
mllama
import
(
"fmt"
"image"
_
"image/jpeg"
_
"image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/model/imageproc"
)
func
getSupportedAspectRatios
(
maxTiles
int
)
[]
image
.
Point
{
ratios
:=
[]
image
.
Point
{}
for
w
:=
range
maxTiles
{
for
h
:=
range
maxTiles
{
if
(
w
+
1
)
*
(
h
+
1
)
<=
maxTiles
{
ratios
=
append
(
ratios
,
image
.
Point
{
w
+
1
,
h
+
1
})
}
}
}
return
ratios
}
// clip constrains a to the inclusive range [lo, hi].
// It assumes lo <= hi, which holds for all callers here (tile sizes are
// never larger than the canvas dimensions they bound).
func clip(a, lo, hi int) int {
	// Built-in min/max (Go 1.21+) replace the hand-rolled comparison chain;
	// the file already relies on Go 1.22 range-over-int, so this is safe.
	return min(max(a, lo), hi)
}
func
getOptimalTiledCanvas
(
imageSize
image
.
Point
,
maxImageTiles
,
tileSize
int
)
image
.
Point
{
possibleTileArrangements
:=
getSupportedAspectRatios
(
maxImageTiles
)
possibleCanvasSizes
:=
[]
image
.
Point
{}
for
_
,
pta
:=
range
possibleTileArrangements
{
possibleCanvasSizes
=
append
(
possibleCanvasSizes
,
image
.
Point
{
pta
.
X
*
tileSize
,
pta
.
Y
*
tileSize
})
}
scales
:=
[]
float64
{}
for
_
,
pcs
:=
range
possibleCanvasSizes
{
scaleHeight
:=
float64
(
pcs
.
Y
)
/
float64
(
imageSize
.
Y
)
scaleWidth
:=
float64
(
pcs
.
X
)
/
float64
(
imageSize
.
X
)
if
scaleWidth
>
scaleHeight
{
scales
=
append
(
scales
,
scaleHeight
)
}
else
{
scales
=
append
(
scales
,
scaleWidth
)
}
}
var
minUpscale
float64
var
maxDownscale
float64
var
upscale
bool
for
_
,
s
:=
range
scales
{
if
s
>
1.0
{
upscale
=
true
if
minUpscale
==
0
{
minUpscale
=
s
}
else
{
minUpscale
=
math
.
Min
(
minUpscale
,
s
)
}
}
else
{
maxDownscale
=
math
.
Max
(
maxDownscale
,
s
)
}
}
selectedScale
:=
maxDownscale
if
upscale
{
selectedScale
=
minUpscale
}
var
selectedCanvas
image
.
Point
for
n
,
pcs
:=
range
possibleCanvasSizes
{
if
scales
[
n
]
==
selectedScale
{
// choose the smallest possible canvas
if
selectedCanvas
.
X
==
0
&&
selectedCanvas
.
Y
==
0
{
selectedCanvas
=
pcs
}
else
if
pcs
.
X
*
pcs
.
Y
<
selectedCanvas
.
X
*
selectedCanvas
.
Y
{
selectedCanvas
=
pcs
}
}
}
return
selectedCanvas
}
func
getImageSizeFitToCanvas
(
imageSize
,
canvasSize
image
.
Point
,
tileSize
int
)
image
.
Point
{
targetWidth
:=
clip
(
imageSize
.
X
,
tileSize
,
canvasSize
.
X
)
targetHeight
:=
clip
(
imageSize
.
Y
,
tileSize
,
canvasSize
.
Y
)
scaleWidth
:=
float64
(
targetWidth
)
/
float64
(
imageSize
.
X
)
scaleHeight
:=
float64
(
targetHeight
)
/
float64
(
imageSize
.
Y
)
var
w
,
h
int
if
scaleWidth
<
scaleHeight
{
w
=
targetWidth
h
=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
Y
)
*
scaleWidth
)),
targetHeight
)
}
else
{
w
=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
X
)
*
scaleHeight
)),
targetWidth
)
h
=
targetHeight
}
return
image
.
Point
{
w
,
h
}
}
func
resizeImage
(
img
image
.
Image
,
format
string
,
outputSize
image
.
Point
,
maxImageTiles
int
)
(
image
.
Image
,
image
.
Point
)
{
if
format
==
"png"
{
img
=
imageproc
.
Composite
(
img
)
}
b
:=
img
.
Bounds
()
tileSize
:=
outputSize
.
Y
canvasSize
:=
getOptimalTiledCanvas
(
b
.
Max
,
maxImageTiles
,
tileSize
)
aspectRatio
:=
image
.
Point
{
canvasSize
.
X
/
tileSize
,
canvasSize
.
Y
/
tileSize
}
newSize
:=
getImageSizeFitToCanvas
(
b
.
Max
,
canvasSize
,
tileSize
)
return
imageproc
.
Resize
(
img
,
newSize
,
imageproc
.
ResizeBilinear
),
aspectRatio
}
func
padImage
(
img
image
.
Image
,
outputSize
,
aspectRatio
image
.
Point
)
image
.
Image
{
paddedSize
:=
image
.
Point
{
X
:
outputSize
.
X
*
aspectRatio
.
X
,
Y
:
outputSize
.
Y
*
aspectRatio
.
Y
,
}
dst
:=
image
.
NewRGBA
(
image
.
Rect
(
0
,
0
,
paddedSize
.
X
,
paddedSize
.
Y
))
draw
.
Draw
(
dst
,
img
.
Bounds
(),
img
,
image
.
Point
{
0
,
0
},
draw
.
Over
)
return
dst
}
func
splitToTiles
(
img
image
.
Image
,
numTilesSize
image
.
Point
)
[]
image
.
Image
{
b
:=
img
.
Bounds
()
width
:=
b
.
Max
.
X
-
b
.
Min
.
X
height
:=
b
.
Max
.
Y
-
b
.
Min
.
Y
tileHeight
:=
height
/
numTilesSize
.
Y
tileWidth
:=
width
/
numTilesSize
.
X
images
:=
[]
image
.
Image
{}
for
h
:=
range
numTilesSize
.
Y
{
for
w
:=
range
numTilesSize
.
X
{
rect
:=
image
.
Rect
(
tileWidth
*
w
,
tileHeight
*
h
,
tileWidth
*
(
w
+
1
),
tileHeight
*
(
h
+
1
))
images
=
append
(
images
,
img
.
(
interface
{
SubImage
(
image
.
Rectangle
)
image
.
Image
})
.
SubImage
(
rect
))
}
}
return
images
}
func
packImages
(
img
image
.
Image
,
aspectRatio
image
.
Point
)
[]
float32
{
subImages
:=
splitToTiles
(
img
,
aspectRatio
)
var
pixelVals
[]
float32
rescale
:=
true
channelFirst
:=
true
for
_
,
subImg
:=
range
subImages
{
vals
:=
imageproc
.
Normalize
(
subImg
,
imageproc
.
ClipDefaultMean
,
imageproc
.
ClipDefaultSTD
,
rescale
,
channelFirst
)
pixelVals
=
append
(
pixelVals
,
vals
...
)
}
return
pixelVals
}
func
Preprocess
(
imageData
io
.
Reader
)
([]
float32
,
map
[
string
]
any
,
error
)
{
outputSize
:=
image
.
Point
{
560
,
560
}
maxTiles
:=
4
img
,
format
,
err
:=
image
.
Decode
(
imageData
)
if
err
!=
nil
{
return
nil
,
nil
,
fmt
.
Errorf
(
"failed to decode image: %w"
,
err
)
}
newImage
,
aspectRatio
:=
resizeImage
(
img
,
format
,
outputSize
,
maxTiles
)
newImage
=
padImage
(
newImage
,
outputSize
,
aspectRatio
)
data
:=
packImages
(
newImage
,
aspectRatio
)
aspectRatioIndex
:=
slices
.
Index
(
getSupportedAspectRatios
(
maxTiles
),
aspectRatio
)
+
1
opts
:=
map
[
string
]
any
{
"aspectRatioIndex"
:
aspectRatioIndex
,
}
return
data
,
opts
,
nil
}
model/models/mllama/imageproc_test.go
deleted
100644 → 0
View file @
20c5fd39
package
mllama
import
(
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
)
// TestAspectRatios checks getSupportedAspectRatios against hand-enumerated
// tile layouts for small tile budgets.
func TestAspectRatios(t *testing.T) {
	// Each case pairs a tile budget with the complete expected layout list,
	// in the order the production function emits them.
	type aspectCase struct {
		MaxTiles int
		Expected []image.Point
	}

	cases := []aspectCase{
		{
			MaxTiles: 1,
			Expected: []image.Point{{1, 1}},
		},
		{
			MaxTiles: 2,
			Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
		},
		{
			MaxTiles: 3,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
		},
		{
			MaxTiles: 4,
			Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
		},
	}

	for _, c := range cases {
		actual := getSupportedAspectRatios(c.MaxTiles)

		if diff := cmp.Diff(actual, c.Expected); diff != "" {
			t.Errorf("mismatch (-got +want):\n%s", diff)
		}
	}
}
// TestGetImageSizeFitToCanvas checks aspect-ratio-preserving fits of an
// image into a canvas, covering untouched, downscaled and upscaled cases.
func TestGetImageSizeFitToCanvas(t *testing.T) {
	type imageSizeCase struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}

	cases := []imageSizeCase{
		{
			// Already fits: returned unchanged.
			ImageRect:  image.Point{400, 400},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{400, 400},
		},
		{
			// Larger than canvas on both axes: scaled down to the canvas.
			ImageRect:  image.Point{1024, 768},
			CanvasRect: image.Point{640, 480},
			TileSize:   200,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 500},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   750,
			Expected:   image.Point{750, 750},
		},
		{
			ImageRect:  image.Point{500, 1000},
			CanvasRect: image.Point{2000, 2000},
			TileSize:   2000,
			Expected:   image.Point{1000, 2000},
		},
		{
			// Height is the limiting axis; width follows, floored.
			ImageRect:  image.Point{4000, 3000},
			CanvasRect: image.Point{2000, 1000},
			TileSize:   1000,
			Expected:   image.Point{1333, 1000},
		},
		{
			ImageRect:  image.Point{667, 1000},
			CanvasRect: image.Point{1000, 1000},
			TileSize:   560,
			Expected:   image.Point{667, 1000},
		},
	}

	for _, c := range cases {
		actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
// TestGetOptimalTiledCanvas checks canvas selection for a range of image
// shapes against a fixed tile budget of 4 and mostly 560px tiles, including
// extreme aspect ratios that saturate the tile budget on one axis.
func TestGetOptimalTiledCanvas(t *testing.T) {
	type tiledCanvasSizeCase struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}

	cases := []tiledCanvasSizeCase{
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      1000,
			Expected:      image.Point{2000, 1000},
		},
		{
			ImageSize:     image.Point{1024, 768},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{800, 600},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
		{
			ImageSize:     image.Point{640, 480},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 560},
		},
		{
			ImageSize:     image.Point{320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{1320, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1680, 560},
		},
		{
			ImageSize:     image.Point{2000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			// Wider than any canvas: capped at the 4x1 arrangement.
			ImageSize:     image.Point{10000, 200},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{2240, 560},
		},
		{
			ImageSize:     image.Point{480, 640},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1120},
		},
		{
			ImageSize:     image.Point{200, 320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 560},
		},
		{
			ImageSize:     image.Point{200, 1320},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 1680},
		},
		{
			ImageSize:     image.Point{200, 2000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			// Taller than any canvas: capped at the 1x4 arrangement.
			ImageSize:     image.Point{200, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{560, 2240},
		},
		{
			ImageSize:     image.Point{10000, 10000},
			MaxImageTiles: 4,
			TileSize:      560,
			Expected:      image.Point{1120, 1120},
		},
	}

	for _, c := range cases {
		actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)

		if actual != c.Expected {
			t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
		}
	}
}
// TestSplitToTiles checks that splitToTiles produces the expected number of
// tiles and that each tile covers the expected sub-rectangle, in row-major
// order.
func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
		}

		// Only the bounds are compared; pixel content is zero-valued in
		// both actual and expected images.
		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}
// TestResize checks resizeImage's output bounds and the tile arrangement it
// selects, always passing format "png" so the composite path is exercised.
func TestResize(t *testing.T) {
	type resizeCase struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}

	cases := []resizeCase{
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       1,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 200, 200)),
			OutputSize:          image.Point{100, 100},
			MaxImageTiles:       2,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 100, 100)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			// Tiny input is upscaled to one full tile.
			TestImage:           image.NewRGBA(image.Rect(0, 0, 10, 10)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 560, 560)),
			ExpectedAspectRatio: image.Point{1, 1},
		},
		{
			TestImage:           image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1120, 840)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
		{
			// Fits inside the 2x2 canvas already: size unchanged.
			TestImage:           image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			OutputSize:          image.Point{560, 560},
			MaxImageTiles:       4,
			ExpectedImage:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedAspectRatio: image.Point{2, 2},
		},
	}

	for _, c := range cases {
		actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)

		if actualImage.Bounds() != c.ExpectedImage.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
		}

		if actualAspectRatio != c.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
		}
	}
}
// TestPad checks that padImage expands an image to the full tiled canvas
// size implied by the aspect ratio and tile output size.
func TestPad(t *testing.T) {
	type padCase struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}

	cases := []padCase{
		{
			// 1000x667 padded onto a 2x2 grid of 560px tiles -> 1120x1120.
			TestImage:   image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			OutputSize:  image.Point{560, 560},
			AspectRatio: image.Point{2, 2},
			Expected:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, c := range cases {
		actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)

		if actual.Bounds() != c.Expected.Bounds() {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
		}
	}
}
// TestPackImages checks that packImages emits one float per channel per
// pixel per tile: tilesX * tilesY * 3 channels * 560 * 560 pixels.
func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{1, 2},
			ExpectedVals: 1 * 2 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}
// TestPreprocess round-trips an image through PNG encoding and Preprocess,
// checking that pixel data is produced and that the reported 1-based
// aspect-ratio index matches the expected tiling.
func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 1,
		},
		{
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          0,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		// ExpectedVals is unused beyond this non-empty check in the
		// original; only the presence of data is asserted.
		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		aspectRatioID := ar.(int)

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
model/models/mllama/model.go
View file @
b2b270ad
...
@@ -2,9 +2,6 @@ package mllama
...
@@ -2,9 +2,6 @@ package mllama
import
(
import
(
"bytes"
"bytes"
"encoding/binary"
"fmt"
"hash/fnv"
"image"
"image"
"slices"
"slices"
...
@@ -34,10 +31,6 @@ const (
...
@@ -34,10 +31,6 @@ const (
)
)
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
func
New
(
c
fs
.
Config
)
(
model
.
Model
,
error
)
{
// Verify unified config
if
c
.
Uint
(
"vision.block_count"
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"non-unified vision model not supported"
)
}
m
:=
Model
{
m
:=
Model
{
BytePairEncoding
:
model
.
NewBytePairEncoding
(
BytePairEncoding
:
model
.
NewBytePairEncoding
(
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
c
.
String
(
"tokenizer.ggml.pretokenizer"
,
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
),
...
@@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) {
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Values
:
c
.
Strings
(
"tokenizer.ggml.tokens"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Types
:
c
.
Ints
(
"tokenizer.ggml.token_type"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
Merges
:
c
.
Strings
(
"tokenizer.ggml.merges"
),
BOS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.bos_token_id"
)),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
AddBOS
:
c
.
Bool
(
"tokenizer.ggml.add_bos_token"
,
true
),
E
OS
:
int32
(
c
.
Uint
(
"tokenizer.ggml.
e
os_token_id"
)),
B
OS
:
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.
b
os_token_id"
))
}
,
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
AddEOS
:
c
.
Bool
(
"tokenizer.ggml.add_eos_token"
,
false
),
EOS
:
append
(
[]
int32
{
int32
(
c
.
Uint
(
"tokenizer.ggml.eos_token_id"
))},
c
.
Ints
(
"tokenizer.ggml.eos_token_ids"
)
...
,
),
},
},
),
),
ImageProcessor
:
newImageProcessor
(
c
),
ImageProcessor
:
newImageProcessor
(
c
),
...
@@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
...
@@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
return
&
m
,
nil
return
&
m
,
nil
}
}
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
any
,
error
)
{
func
(
m
*
Model
)
EncodeMultimodal
(
ctx
ml
.
Context
,
multimodalData
[]
byte
)
(
[]
input
.
Multimodal
,
error
)
{
if
len
(
m
.
VisionModel
.
Transformer
.
Layers
)
==
0
||
len
(
m
.
GlobalTransformer
.
Layers
)
==
0
{
if
len
(
m
.
VisionModel
.
Transformer
.
Layers
)
==
0
||
len
(
m
.
GlobalTransformer
.
Layers
)
==
0
{
return
nil
,
model
.
ErrNoVisionModel
return
nil
,
model
.
ErrNoVisionModel
}
}
...
@@ -73,81 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
...
@@ -73,81 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return
nil
,
err
return
nil
,
err
}
}
f32s
,
aspectR
atio
ID
,
err
:=
m
.
ImageProcessor
.
ProcessImage
(
image
)
f32s
,
r
atio
,
err
:=
m
.
ImageProcessor
.
ProcessImage
(
image
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
err
return
nil
,
err
}
}
pixelValues
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
if
ratio
.
numTiles
()
<
m
.
maxNumTiles
{
m
.
ImageProcessor
.
imageSize
,
// Pad tiles to maxNumTiles
m
.
ImageProcessor
.
imageSize
,
f32s
=
slices
.
Grow
(
f32s
,
m
.
imageSize
*
m
.
imageSize
*
m
.
numChannels
*
m
.
maxNumTiles
)
m
.
ImageProcessor
.
numChannels
,
f32s
=
f32s
[
:
m
.
imageSize
*
m
.
imageSize
*
m
.
numChannels
*
m
.
maxNumTiles
]
m
.
ImageProcessor
.
maxNumTiles
,
)
if
err
!=
nil
{
return
nil
,
err
}
}
aspectRatio
,
err
:=
ctx
.
Input
()
.
FromIntSlice
([]
int32
{
int32
(
aspectRatioID
)},
1
)
pixelValues
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
m
.
imageSize
,
m
.
imageSize
,
m
.
numChannels
,
m
.
maxNumTiles
)
if
err
!=
nil
{
aspectRatio
:=
ctx
.
Input
()
.
FromIntSlice
([]
int32
{
int32
(
ratio
.
rank
)},
1
)
return
nil
,
err
}
positionIDs
:=
ctx
.
Arange
(
0
,
1601
,
1
,
ml
.
DTypeI32
)
positionIDs
:=
ctx
.
Arange
(
0
,
1601
,
1
,
ml
.
DTypeI32
)
crossAttentionStates
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
,
positionIDs
,
aspectRatio
)
crossAttentionStates
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
,
positionIDs
,
aspectRatio
)
return
m
.
Projector
.
Forward
(
ctx
,
crossAttentionStates
),
nil
projectedOutputs
:=
m
.
Projector
.
Forward
(
ctx
,
crossAttentionStates
)
return
[]
input
.
Multimodal
{{
Tensor
:
projectedOutputs
}},
nil
}
}
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
var
images
[]
input
.
Input
fnvHash
:=
fnv
.
New64a
()
for
i
:=
range
inputs
{
for
i
:=
range
inputs
{
if
inputs
[
i
]
.
Multimodal
==
nil
{
if
inputs
[
i
]
.
Multimodal
!=
nil
{
if
len
(
images
)
>
0
{
inputs
[
i
]
.
Token
=
128256
// <|image|>
inputs
[
i
]
.
Multimodal
=
[]
ml
.
Tensor
{
images
[
0
]
.
Multimodal
.
(
ml
.
Tensor
)}
inputs
[
i
]
.
MultimodalHash
=
images
[
0
]
.
MultimodalHash
for
j
:=
1
;
j
<
len
(
images
);
j
++
{
inputs
[
i
]
.
Multimodal
=
append
(
inputs
[
i
]
.
Multimodal
.
([]
ml
.
Tensor
),
images
[
0
]
.
Multimodal
.
(
ml
.
Tensor
))
fnvHash
.
Reset
()
binary
.
Write
(
fnvHash
,
binary
.
NativeEndian
,
inputs
[
i
]
.
MultimodalHash
)
binary
.
Write
(
fnvHash
,
binary
.
NativeEndian
,
inputs
[
j
]
.
MultimodalHash
)
inputs
[
i
]
.
MultimodalHash
=
fnvHash
.
Sum64
()
}
images
=
nil
}
}
else
{
images
=
append
(
images
,
inputs
[
i
])
inputs
[
i
]
.
Token
=
-
1
}
}
}
}
inputs
=
slices
.
DeleteFunc
(
inputs
,
func
(
input
input
.
Input
)
bool
{
return
input
.
Token
==
-
1
})
return
inputs
,
nil
return
inputs
,
nil
}
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
var
crossAttentionStates
ml
.
Tensor
var
crossAttentionStates
ml
.
Tensor
if
len
(
batch
.
Multimodal
)
>
0
{
if
len
(
batch
.
Multimodal
)
>
0
{
images
:=
batch
.
Multimodal
[
len
(
batch
.
Multimodal
)
-
1
]
.
Multimodal
.
([]
ml
.
Tensor
)
crossAttentionStates
=
batch
.
Multimodal
[
len
(
batch
.
Multimodal
)
-
1
]
.
Multimodal
[
0
]
.
Tensor
if
len
(
images
)
>
0
{
crossAttentionStates
=
images
[
len
(
images
)
-
1
]
}
}
positions
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
return
nil
,
err
}
}
outputs
,
err
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
positions
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Positions
,
len
(
batch
.
Positions
))
if
err
!=
nil
{
outputs
:=
ctx
.
Input
()
.
FromIntSlice
(
batch
.
Outputs
,
len
(
batch
.
Outputs
))
return
nil
,
err
}
// TODO: attention mask, cross attention mask
// TODO: attention mask, cross attention mask
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
nil
,
crossAttentionStates
,
nil
,
m
.
Cache
.
(
*
kvcache
.
WrapperCache
)),
nil
return
m
.
TextModel
.
Forward
(
ctx
,
batch
.
Inputs
,
positions
,
outputs
,
crossAttentionStates
,
nil
,
m
.
Cache
.
(
*
kvcache
.
WrapperCache
)),
nil
}
}
func
init
()
{
func
init
()
{
...
...
model/models/mllama/model_text.go
View file @
b2b270ad
...
@@ -8,6 +8,8 @@ import (
...
@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
)
)
type
TextSelfAttention
struct
{
type
TextSelfAttention
struct
{
...
@@ -18,18 +20,17 @@ type TextSelfAttention struct {
...
@@ -18,18 +20,17 @@ type TextSelfAttention struct {
RopeFactors
ml
.
Tensor
`gguf:"rope_freqs.weight"`
RopeFactors
ml
.
Tensor
`gguf:"rope_freqs.weight"`
}
}
func
(
sa
*
TextSelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
,
_
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
func
(
sa
*
TextSelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
batchSize
:=
hiddenState
.
Dim
(
1
)
batchSize
:=
hiddenState
.
Dim
(
1
)
headDim
:=
opts
.
hiddenSize
/
opts
.
numHeads
headDim
:=
opts
.
hiddenSize
/
opts
.
numHeads
ropeType
:=
uint32
(
0
)
query
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
query
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
query
=
query
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
query
=
query
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
batchSize
)
query
=
query
.
RoPE
(
ctx
,
positions
,
sa
.
RopeFactors
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
query
=
fast
.
RoPE
(
ctx
,
query
,
positions
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
)
)
key
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
key
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
key
=
key
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
key
=
key
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
key
=
key
.
RoPE
(
ctx
,
positions
,
sa
.
RopeFactors
,
opts
.
ropeDim
,
ropeType
,
opts
.
ropeBase
,
opts
.
ropeScale
)
key
=
fast
.
RoPE
(
ctx
,
key
,
positions
,
opts
.
ropeDim
,
opts
.
ropeBase
,
opts
.
ropeScale
,
rope
.
WithFactors
(
sa
.
RopeFactors
)
)
value
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
value
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numKVHeads
,
batchSize
)
...
@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
...
@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
func
(
m
*
TextModel
)
Shift
(
ctx
ml
.
Context
,
layer
int
,
key
,
shift
ml
.
Tensor
)
(
ml
.
Tensor
,
error
)
{
// This will only get called for layers in the cache, which are just the self attention layers
// This will only get called for layers in the cache, which are just the self attention layers
if
sa
,
ok
:=
m
.
Transformer
.
Layers
[
layer
]
.
(
*
TextSelfAttentionDecoderLayer
);
ok
{
if
sa
,
ok
:=
m
.
Transformer
.
Layers
[
layer
]
.
(
*
TextSelfAttentionDecoderLayer
);
ok
{
return
key
.
RoPE
(
ctx
,
shift
,
sa
.
SelfAttention
.
RopeFactors
,
m
.
ropeDim
,
uint32
(
0
),
m
.
ropeBase
,
m
.
ropeScale
),
nil
return
fast
.
RoPE
(
ctx
,
key
,
shift
,
m
.
ropeDim
,
m
.
ropeBase
,
m
.
ropeScale
,
rope
.
WithFactors
(
sa
.
SelfAttention
.
RopeFactors
)
),
nil
}
}
return
key
,
nil
return
key
,
nil
...
@@ -69,11 +70,11 @@ type TextSelfAttentionDecoderLayer struct {
...
@@ -69,11 +70,11 @@ type TextSelfAttentionDecoderLayer struct {
MLP
*
TextMLP
MLP
*
TextMLP
}
}
func
(
d
*
TextSelfAttentionDecoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
,
outputs
,
mask
,
_
,
_
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
func
(
d
*
TextSelfAttentionDecoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positions
,
outputs
,
_
,
_
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
residual
:=
hiddenState
residual
:=
hiddenState
hiddenState
=
d
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
d
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
d
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
positions
,
mask
,
cache
,
opts
)
hiddenState
=
d
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
positions
,
cache
,
opts
)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
// we need logits for.
...
@@ -151,7 +152,7 @@ type TextCrossAttentionDecoderLayer struct {
...
@@ -151,7 +152,7 @@ type TextCrossAttentionDecoderLayer struct {
MLPGate
ml
.
Tensor
`gguf:"cross_attn_mlp_gate"`
MLPGate
ml
.
Tensor
`gguf:"cross_attn_mlp_gate"`
}
}
func
(
d
*
TextCrossAttentionDecoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
_
,
_
,
_
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
func
(
d
*
TextCrossAttentionDecoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
_
,
_
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
residual
:=
hiddenState
residual
:=
hiddenState
hiddenState
=
d
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
d
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
...
@@ -167,14 +168,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
...
@@ -167,14 +168,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
}
}
type
TextDecoderLayer
interface
{
type
TextDecoderLayer
interface
{
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
outputs
,
mask
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
outputs
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
}
}
type
TextDecoder
struct
{
type
TextDecoder
struct
{
Layers
[]
TextDecoderLayer
Layers
[]
TextDecoderLayer
}
}
func
(
d
*
TextDecoder
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
outputs
,
mask
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
func
(
d
*
TextDecoder
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
outputs
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
,
opts
*
TextModelOptions
)
ml
.
Tensor
{
for
i
,
layer
:=
range
d
.
Layers
{
for
i
,
layer
:=
range
d
.
Layers
{
layerType
:=
selfAttentionLayer
layerType
:=
selfAttentionLayer
if
slices
.
Contains
(
opts
.
crossAttentionLayers
,
int32
(
i
))
{
if
slices
.
Contains
(
opts
.
crossAttentionLayers
,
int32
(
i
))
{
...
@@ -190,7 +191,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
...
@@ -190,7 +191,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
lastLayerOutputs
=
outputs
lastLayerOutputs
=
outputs
}
}
hiddenState
=
layer
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
lastLayerOutputs
,
mask
,
crossAttentionStates
,
crossAttentionMask
,
cache
,
opts
)
hiddenState
=
layer
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
lastLayerOutputs
,
crossAttentionStates
,
crossAttentionMask
,
cache
,
opts
)
}
}
}
}
...
@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
...
@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
type
TextModelOptions
struct
{
type
TextModelOptions
struct
{
hiddenSize
,
numHeads
,
numKVHeads
int
hiddenSize
,
numHeads
,
numKVHeads
int
ropeDim
int
eps
,
ropeBase
,
ropeScale
float32
eps
,
ropeBase
,
ropeScale
float32
ropeDim
uint32
crossAttentionLayers
[]
int32
crossAttentionLayers
[]
int32
}
}
...
@@ -214,9 +215,9 @@ type TextModel struct {
...
@@ -214,9 +215,9 @@ type TextModel struct {
*
TextModelOptions
*
TextModelOptions
}
}
func
(
m
*
TextModel
)
Forward
(
ctx
ml
.
Context
,
inputIDs
,
positionIDs
,
outputs
,
mask
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
)
ml
.
Tensor
{
func
(
m
*
TextModel
)
Forward
(
ctx
ml
.
Context
,
inputIDs
,
positionIDs
,
outputs
,
crossAttentionStates
,
crossAttentionMask
ml
.
Tensor
,
cache
*
kvcache
.
WrapperCache
)
ml
.
Tensor
{
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputIDs
)
hiddenState
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputIDs
)
hiddenState
=
m
.
Transformer
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
outputs
,
mask
,
crossAttentionStates
,
crossAttentionMask
,
cache
,
m
.
TextModelOptions
)
hiddenState
=
m
.
Transformer
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
outputs
,
crossAttentionStates
,
crossAttentionMask
,
cache
,
m
.
TextModelOptions
)
hiddenState
=
m
.
OutputNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
hiddenState
=
m
.
OutputNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
return
m
.
Output
.
Forward
(
ctx
,
hiddenState
)
return
m
.
Output
.
Forward
(
ctx
,
hiddenState
)
}
}
...
@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
...
@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"embedding_length"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numHeads
:
int
(
c
.
Uint
(
"attention.head_count"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
numKVHeads
:
int
(
c
.
Uint
(
"attention.head_count_kv"
)),
ropeDim
:
int
(
c
.
Uint
(
"rope.dimension_count"
)),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
eps
:
c
.
Float
(
"attention.layer_norm_rms_epsilon"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeBase
:
c
.
Float
(
"rope.freq_base"
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeScale
:
c
.
Float
(
"rope.freq_scale"
,
1
),
ropeDim
:
c
.
Uint
(
"rope.dimension_count"
),
crossAttentionLayers
:
c
.
Ints
(
"attention.cross_attention_layers"
),
crossAttentionLayers
:
c
.
Ints
(
"attention.cross_attention_layers"
),
},
},
}
}
...
...
model/models/mllama/model_vision.go
View file @
b2b270ad
...
@@ -15,9 +15,7 @@ type VisionSelfAttention struct {
...
@@ -15,9 +15,7 @@ type VisionSelfAttention struct {
Query
*
nn
.
Linear
`gguf:"attn_q"`
Query
*
nn
.
Linear
`gguf:"attn_q"`
Key
*
nn
.
Linear
`gguf:"attn_k"`
Key
*
nn
.
Linear
`gguf:"attn_k"`
Value
*
nn
.
Linear
`gguf:"attn_v"`
Value
*
nn
.
Linear
`gguf:"attn_v"`
Output
*
nn
.
Linear
`gguf:"attn_out"`
Output
*
nn
.
Linear
`gguf:"attn_output"`
Gate
ml
.
Tensor
`gguf:"attn_gate"`
}
}
func
(
sa
*
VisionSelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
func
(
sa
*
VisionSelfAttention
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
...
@@ -25,56 +23,38 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
...
@@ -25,56 +23,38 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
query
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
query
:=
sa
.
Query
.
Forward
(
ctx
,
hiddenState
)
query
=
query
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
query
.
Dim
(
1
),
batchSize
)
query
=
query
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
query
.
Dim
(
1
),
batchSize
)
query
=
query
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
key
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
key
:=
sa
.
Key
.
Forward
(
ctx
,
hiddenState
)
key
=
key
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
key
.
Dim
(
1
),
batchSize
)
key
=
key
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
key
.
Dim
(
1
),
batchSize
)
key
=
key
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
value
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
value
:=
sa
.
Value
.
Forward
(
ctx
,
hiddenState
)
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
value
.
Dim
(
1
),
batchSize
)
value
=
value
.
Reshape
(
ctx
,
headDim
,
opts
.
numHeads
,
value
.
Dim
(
1
),
batchSize
)
value
=
value
.
Permute
(
ctx
,
1
,
2
,
0
,
3
)
.
Contiguous
(
ctx
)
scores
:=
key
.
Mulmat
(
ctx
,
query
)
scores
=
scores
.
Scale
(
ctx
,
1.0
/
math
.
Sqrt
(
float64
(
headDim
)))
scores
=
scores
.
Softmax
(
ctx
)
attention
:=
value
.
Mulmat
(
ctx
,
scores
)
attention
:=
nn
.
Attention
(
ctx
,
query
,
key
,
value
,
1.
/
math
.
Sqrt
(
float64
(
headDim
)),
nil
)
attention
=
attention
.
Reshape
(
ctx
,
headDim
,
attention
.
Dim
(
1
),
opts
.
numHeads
,
batchSize
)
attention
=
attention
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
attention
=
attention
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
attention
.
Dim
(
2
),
batchSize
)
attention
=
attention
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
attention
.
Dim
(
2
),
batchSize
)
return
sa
.
Output
.
Forward
(
ctx
,
attention
)
hiddenState
=
sa
.
Output
.
Forward
(
ctx
,
attention
)
if
sa
.
Gate
!=
nil
{
hiddenState
=
hiddenState
.
Mul
(
ctx
,
sa
.
Gate
)
}
return
hiddenState
}
}
type
VisionMLP
struct
{
type
VisionMLP
struct
{
Down
*
nn
.
Linear
`gguf:"ffn_down"`
Up
*
nn
.
Linear
`gguf:"ffn_up"`
Up
*
nn
.
Linear
`gguf:"ffn_up"`
Down
*
nn
.
Linear
`gguf:"ffn_down"`
Gate
ml
.
Tensor
`gguf:"ffn_gate"`
}
}
func
(
mlp
*
VisionMLP
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
func
(
mlp
*
VisionMLP
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
hiddenState
=
mlp
.
Down
.
Forward
(
ctx
,
hiddenState
)
.
GELU
(
ctx
)
hiddenState
=
mlp
.
Up
.
Forward
(
ctx
,
hiddenState
)
.
GELU
(
ctx
)
hiddenState
=
mlp
.
Up
.
Forward
(
ctx
,
hiddenState
)
hiddenState
=
mlp
.
Down
.
Forward
(
ctx
,
hiddenState
)
if
mlp
.
Gate
!=
nil
{
hiddenState
=
hiddenState
.
Mul
(
ctx
,
mlp
.
Gate
)
}
return
hiddenState
return
hiddenState
}
}
type
VisionEncoderLayer
struct
{
type
VisionEncoderLayer
struct
{
AttentionNorm
*
nn
.
LayerNorm
`gguf:"
ln1
"`
AttentionNorm
*
nn
.
LayerNorm
`gguf:"
attn_norm
"`
SelfAttention
*
VisionSelfAttention
SelfAttention
*
VisionSelfAttention
AttentionGate
ml
.
Tensor
`gguf:"attn_gate"`
MLPNorm
*
nn
.
LayerNorm
`gguf:"
ln2
"`
MLPNorm
*
nn
.
LayerNorm
`gguf:"
ffn_norm
"`
MLP
*
VisionMLP
MLP
*
VisionMLP
MLPGate
ml
.
Tensor
`gguf:"ffn_gate"`
}
}
func
(
e
*
VisionEncoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
func
(
e
*
VisionEncoderLayer
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
...
@@ -83,13 +63,19 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
...
@@ -83,13 +63,19 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
// self attention
// self attention
hiddenState
=
e
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
e
.
AttentionNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
e
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
opts
)
hiddenState
=
e
.
SelfAttention
.
Forward
(
ctx
,
hiddenState
,
opts
)
if
e
.
AttentionGate
!=
nil
{
hiddenState
=
hiddenState
.
Mul
(
ctx
,
e
.
AttentionGate
)
}
hiddenState
=
hiddenState
.
Add
(
ctx
,
residual
)
hiddenState
=
hiddenState
.
Add
(
ctx
,
residual
)
residual
=
hiddenState
residual
=
hiddenState
// feed forward
hiddenState
=
e
.
MLPNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
e
.
MLPNorm
.
Forward
(
ctx
,
hiddenState
,
opts
.
eps
)
hiddenState
=
e
.
MLP
.
Forward
(
ctx
,
hiddenState
,
opts
)
hiddenState
=
e
.
MLP
.
Forward
(
ctx
,
hiddenState
,
opts
)
return
hiddenState
.
Add
(
ctx
,
residual
)
if
e
.
MLPGate
!=
nil
{
hiddenState
=
hiddenState
.
Mul
(
ctx
,
e
.
MLPGate
)
}
hiddenState
=
hiddenState
.
Add
(
ctx
,
residual
)
return
hiddenState
}
}
type
VisionEncoder
struct
{
type
VisionEncoder
struct
{
...
@@ -114,9 +100,9 @@ type PrecomputedAspectRatioEmbedding struct {
...
@@ -114,9 +100,9 @@ type PrecomputedAspectRatioEmbedding struct {
Gate
ml
.
Tensor
`gguf:"gate"`
Gate
ml
.
Tensor
`gguf:"gate"`
}
}
func
(
e
*
PrecomputedAspectRatioEmbedding
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
aspectRatioIDs
ml
.
Tensor
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
func
(
e
*
PrecomputedAspectRatioEmbedding
)
Forward
(
ctx
ml
.
Context
,
hiddenState
ml
.
Tensor
,
aspectRatioIDs
ml
.
Tensor
,
numTiles
int
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
embeddings
:=
e
.
Embedding
.
Forward
(
ctx
,
aspectRatioIDs
)
embeddings
:=
e
.
Embedding
.
Forward
(
ctx
,
aspectRatioIDs
)
embeddings
=
embeddings
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
1
,
opts
.
numTiles
)
embeddings
=
embeddings
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
1
,
numTiles
)
if
e
.
Gate
!=
nil
{
if
e
.
Gate
!=
nil
{
embeddings
=
embeddings
.
Mul
(
ctx
,
e
.
Gate
)
embeddings
=
embeddings
.
Mul
(
ctx
,
e
.
Gate
)
}
}
...
@@ -132,7 +118,7 @@ type PrecomputedPositionEmbedding struct {
...
@@ -132,7 +118,7 @@ type PrecomputedPositionEmbedding struct {
TilePositionEmbeddingGate
ml
.
Tensor
`gguf:"tile_position_embd.gate"`
TilePositionEmbeddingGate
ml
.
Tensor
`gguf:"tile_position_embd.gate"`
}
}
func
(
e
*
PrecomputedPositionEmbedding
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
aspectRatioIDs
ml
.
Tensor
,
numPositions
int
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
func
(
e
*
PrecomputedPositionEmbedding
)
Forward
(
ctx
ml
.
Context
,
hiddenState
,
positionIDs
,
aspectRatioIDs
ml
.
Tensor
,
numPositions
,
numTiles
int
,
opts
*
VisionModelOptions
)
ml
.
Tensor
{
positionEmbedding
:=
e
.
PositionEmbedding
.
Forward
(
ctx
,
positionIDs
)
positionEmbedding
:=
e
.
PositionEmbedding
.
Forward
(
ctx
,
positionIDs
)
if
e
.
PositionEmbeddingGate
!=
nil
{
if
e
.
PositionEmbeddingGate
!=
nil
{
positionEmbedding
=
positionEmbedding
.
Mul
(
ctx
,
e
.
PositionEmbeddingGate
)
positionEmbedding
=
positionEmbedding
.
Mul
(
ctx
,
e
.
PositionEmbeddingGate
)
...
@@ -141,7 +127,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
...
@@ -141,7 +127,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
hiddenState
=
hiddenState
.
Add
(
ctx
,
positionEmbedding
)
hiddenState
=
hiddenState
.
Add
(
ctx
,
positionEmbedding
)
tilePositionEmbedding
:=
e
.
TilePositionEmbedding
.
Forward
(
ctx
,
aspectRatioIDs
)
tilePositionEmbedding
:=
e
.
TilePositionEmbedding
.
Forward
(
ctx
,
aspectRatioIDs
)
tilePositionEmbedding
=
tilePositionEmbedding
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
numPositions
,
opts
.
numTiles
)
tilePositionEmbedding
=
tilePositionEmbedding
.
Reshape
(
ctx
,
opts
.
hiddenSize
,
numPositions
,
numTiles
)
if
e
.
TilePositionEmbeddingGate
!=
nil
{
if
e
.
TilePositionEmbeddingGate
!=
nil
{
tilePositionEmbedding
=
tilePositionEmbedding
.
Mul
(
ctx
,
e
.
TilePositionEmbeddingGate
)
tilePositionEmbedding
=
tilePositionEmbedding
.
Mul
(
ctx
,
e
.
TilePositionEmbeddingGate
)
}
}
...
@@ -150,9 +136,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
...
@@ -150,9 +136,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
}
}
type
VisionModelOptions
struct
{
type
VisionModelOptions
struct
{
hiddenSize
,
numHeads
,
numTiles
int
hiddenSize
,
numHeads
int
imageSize
,
patchSize
int
imageSize
,
patchSize
int
eps
float32
eps
float32
intermediateLayersIndices
[]
int32
intermediateLayersIndices
[]
int32
}
}
...
@@ -181,14 +167,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
...
@@ -181,14 +167,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
numPositions
++
numPositions
++
}
}
numTiles
:=
pixelValues
.
Dim
(
3
)
hiddenState
:=
m
.
PatchEmbeddings
.
Forward
(
ctx
,
pixelValues
,
m
.
patchSize
,
m
.
patchSize
,
0
,
0
,
1
,
1
)
hiddenState
:=
m
.
PatchEmbeddings
.
Forward
(
ctx
,
pixelValues
,
m
.
patchSize
,
m
.
patchSize
,
0
,
0
,
1
,
1
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
numPatches
,
m
.
hiddenSize
,
m
.
numTiles
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
numPatches
,
m
.
hiddenSize
,
numTiles
)
hiddenState
=
hiddenState
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
hiddenState
=
hiddenState
.
Permute
(
ctx
,
1
,
0
,
2
,
3
)
.
Contiguous
(
ctx
)
hiddenState
=
m
.
PreTilePositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
aspectRatioIDs
,
m
.
VisionModelOptions
)
hiddenState
=
m
.
PreTilePositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
aspectRatioIDs
,
numTiles
,
m
.
VisionModelOptions
)
hiddenState
=
m
.
ClassEmbedding
.
Repeat
(
ctx
,
2
,
m
.
numTiles
)
.
Concat
(
ctx
,
hiddenState
,
1
)
hiddenState
=
m
.
ClassEmbedding
.
Repeat
(
ctx
,
2
,
numTiles
)
.
Concat
(
ctx
,
hiddenState
,
1
)
hiddenState
=
m
.
PositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
aspectRatioIDs
,
numPositions
,
m
.
VisionModelOptions
)
hiddenState
=
m
.
PositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
positionIDs
,
aspectRatioIDs
,
numPositions
,
numTiles
,
m
.
VisionModelOptions
)
hiddenState
=
m
.
PreLayerNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
hiddenState
=
m
.
PreLayerNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
numPaddingPatches
:=
8
-
(
hiddenState
.
Dim
(
1
)
%
8
)
%
8
numPaddingPatches
:=
8
-
(
hiddenState
.
Dim
(
1
)
%
8
)
%
8
...
@@ -199,18 +187,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
...
@@ -199,18 +187,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
hiddenState
=
m
.
PostLayerNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
hiddenState
=
m
.
PostLayerNorm
.
Forward
(
ctx
,
hiddenState
,
m
.
eps
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
m
.
numTiles
,
batchSize
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
numTiles
,
batchSize
)
hiddenState
=
m
.
PostTilePositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
aspectRatioIDs
,
m
.
VisionModelOptions
)
hiddenState
=
m
.
PostTilePositionEmbedding
.
Forward
(
ctx
,
hiddenState
,
aspectRatioIDs
,
numTiles
,
m
.
VisionModelOptions
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
m
.
numTiles
*
(
numPositions
+
numPaddingPatches
),
batchSize
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
numTiles
*
(
numPositions
+
numPaddingPatches
),
batchSize
)
hiddenState
,
_
=
m
.
GlobalTransformer
.
Forward
(
ctx
,
hiddenState
,
nil
,
m
.
VisionModelOptions
)
hiddenState
,
_
=
m
.
GlobalTransformer
.
Forward
(
ctx
,
hiddenState
,
nil
,
m
.
VisionModelOptions
)
hiddenStates
:=
intermediateHiddenStates
[
0
]
.
Stack
(
ctx
,
0
,
intermediateHiddenStates
[
1
:
]
...
)
hiddenStates
:=
intermediateHiddenStates
[
0
]
.
Stack
(
ctx
,
0
,
intermediateHiddenStates
[
1
:
]
...
)
hiddenStates
=
hiddenStates
.
Reshape
(
ctx
,
len
(
intermediateHiddenStates
)
*
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
m
.
numTiles
,
batchSize
)
hiddenStates
=
hiddenStates
.
Reshape
(
ctx
,
len
(
intermediateHiddenStates
)
*
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
numTiles
,
batchSize
)
hiddenStates
=
hiddenStates
.
Unp
ad
(
ctx
,
0
,
numPaddingPatches
,
0
,
0
)
hiddenStates
=
hiddenStates
.
P
ad
(
ctx
,
0
,
-
numPaddingPatches
,
0
,
0
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
m
.
numTiles
,
batchSize
)
hiddenState
=
hiddenState
.
Reshape
(
ctx
,
m
.
hiddenSize
,
numPositions
+
numPaddingPatches
,
numTiles
,
batchSize
)
hiddenState
=
hiddenState
.
Unp
ad
(
ctx
,
0
,
numPaddingPatches
,
0
,
0
)
hiddenState
=
hiddenState
.
P
ad
(
ctx
,
0
,
-
numPaddingPatches
,
0
,
0
)
return
hiddenState
.
Concat
(
ctx
,
hiddenStates
,
0
)
return
hiddenState
.
Concat
(
ctx
,
hiddenStates
,
0
)
}
}
...
@@ -222,7 +210,6 @@ func newVisionModel(c fs.Config) *VisionModel {
...
@@ -222,7 +210,6 @@ func newVisionModel(c fs.Config) *VisionModel {
VisionModelOptions
:
&
VisionModelOptions
{
VisionModelOptions
:
&
VisionModelOptions
{
hiddenSize
:
int
(
c
.
Uint
(
"vision.embedding_length"
)),
hiddenSize
:
int
(
c
.
Uint
(
"vision.embedding_length"
)),
numHeads
:
int
(
c
.
Uint
(
"vision.attention.head_count"
)),
numHeads
:
int
(
c
.
Uint
(
"vision.attention.head_count"
)),
numTiles
:
int
(
c
.
Uint
(
"vision.max_num_tiles"
)),
imageSize
:
int
(
c
.
Uint
(
"vision.image_size"
)),
imageSize
:
int
(
c
.
Uint
(
"vision.image_size"
)),
patchSize
:
int
(
c
.
Uint
(
"vision.patch_size"
)),
patchSize
:
int
(
c
.
Uint
(
"vision.patch_size"
)),
...
...
model/models/mllama/process_image.go
View file @
b2b270ad
...
@@ -2,17 +2,31 @@ package mllama
...
@@ -2,17 +2,31 @@ package mllama
import
(
import
(
"image"
"image"
"image/color"
"math"
"math"
"slices"
"slices"
"golang.org/x/image/draw"
"golang.org/x/image/draw"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
)
type
supportedAspectRatio
struct
{
rank
,
width
,
height
int
}
func
(
a
supportedAspectRatio
)
Point
()
image
.
Point
{
return
image
.
Point
{
a
.
width
,
a
.
height
}
}
func
(
a
supportedAspectRatio
)
numTiles
()
int
{
return
a
.
width
*
a
.
height
}
type
ImageProcessor
struct
{
type
ImageProcessor
struct
{
imageSize
,
numChannels
,
maxNumTiles
int
imageSize
,
numChannels
,
maxNumTiles
int
mean
,
std
[
3
]
float32
}
}
func
newImageProcessor
(
c
fs
.
Config
)
ImageProcessor
{
func
newImageProcessor
(
c
fs
.
Config
)
ImageProcessor
{
...
@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
...
@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
imageSize
:
int
(
c
.
Uint
(
"vision.image_size"
)),
imageSize
:
int
(
c
.
Uint
(
"vision.image_size"
)),
numChannels
:
int
(
c
.
Uint
(
"vision.num_channels"
)),
numChannels
:
int
(
c
.
Uint
(
"vision.num_channels"
)),
maxNumTiles
:
int
(
c
.
Uint
(
"vision.max_num_tiles"
)),
maxNumTiles
:
int
(
c
.
Uint
(
"vision.max_num_tiles"
)),
mean
:
imageproc
.
ClipDefaultMean
,
std
:
imageproc
.
ClipDefaultSTD
,
}
}
}
}
func
(
p
*
ImageProcessor
)
supportedAspectRatios
(
maxTiles
int
)
[]
image
.
Point
{
func
(
p
ImageProcessor
)
supportedAspectRatios
()
(
ratios
[]
supportedAspectRatio
)
{
ratios
:=
[]
image
.
Point
{}
for
w
:=
1
;
w
<=
p
.
maxNumTiles
;
w
++
{
for
h
:=
1
;
h
<=
p
.
maxNumTiles
/
w
;
h
++
{
for
w
:=
range
maxTiles
{
ratios
=
append
(
ratios
,
supportedAspectRatio
{
len
(
ratios
)
+
1
,
w
,
h
})
for
h
:=
range
maxTiles
{
if
(
w
+
1
)
*
(
h
+
1
)
<=
maxTiles
{
ratios
=
append
(
ratios
,
image
.
Point
{
w
+
1
,
h
+
1
})
}
}
}
}
}
return
ratios
return
ratios
}
}
func
(
p
*
ImageProcessor
)
clip
(
a
,
a_min
,
a_max
int
)
int
{
func
(
p
ImageProcessor
)
fitToCanvas
(
imageSize
,
canvasSize
image
.
Point
)
image
.
Point
{
if
a
<
a_min
{
tw
:=
min
(
max
(
imageSize
.
X
,
p
.
imageSize
),
canvasSize
.
X
)
return
a_min
th
:=
min
(
max
(
imageSize
.
Y
,
p
.
imageSize
),
canvasSize
.
Y
)
}
else
if
a
>
a_max
{
return
a_max
}
return
a
}
func
(
p
*
ImageProcessor
)
fitToCanvas
(
imageSize
,
canvasSize
image
.
Point
,
tileSize
int
)
image
.
Point
{
targetWidth
:=
p
.
clip
(
imageSize
.
X
,
tileSize
,
canvasSize
.
X
)
targetHeight
:=
p
.
clip
(
imageSize
.
Y
,
tileSize
,
canvasSize
.
Y
)
scaleWidth
:=
float64
(
targetWidth
)
/
float64
(
imageSize
.
X
)
r
:=
math
.
Min
(
scaleHeight
:=
float64
(
targetHeight
)
/
float64
(
imageSize
.
Y
)
float64
(
tw
)
/
float64
(
imageSize
.
X
),
float64
(
th
)
/
float64
(
imageSize
.
Y
),
)
var
w
,
h
int
w
:=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
X
)
*
r
)),
tw
)
h
:=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
Y
)
*
r
)),
th
)
if
scaleWidth
<
scaleHeight
{
w
=
targetWidth
h
=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
Y
)
*
scaleWidth
)),
targetHeight
)
}
else
{
w
=
min
(
int
(
math
.
Floor
(
float64
(
imageSize
.
X
)
*
scaleHeight
)),
targetWidth
)
h
=
targetHeight
}
return
image
.
Point
{
w
,
h
}
return
image
.
Point
{
w
,
h
}
}
}
func
(
p
*
ImageProcessor
)
optimalTiledCanvas
(
imageSize
image
.
Point
,
maxImageTiles
,
tileSize
int
)
image
.
Point
{
func
(
p
ImageProcessor
)
optimalTiledCanvas
(
imageSize
image
.
Point
)
image
.
Point
{
possibleTileArrangements
:=
p
.
supportedAspectRatios
(
maxImageTiles
)
possibleTileArrangements
:=
p
.
supportedAspectRatios
()
possibleCanvasSizes
:=
[]
image
.
Point
{}
possibleCanvasSizes
:=
make
(
[]
image
.
Point
,
len
(
possibleTileArrangements
))
for
_
,
pta
:=
range
possibleTileArrangements
{
for
i
,
pta
:=
range
possibleTileArrangements
{
possibleCanvasSizes
=
append
(
possibleCanvasSizes
,
image
.
Point
{
pta
.
X
*
tileSize
,
pta
.
Y
*
til
eSize
}
)
possibleCanvasSizes
[
i
]
=
image
.
Point
{
pta
.
width
*
p
.
imageSize
,
pta
.
height
*
p
.
imag
eSize
}
}
}
scales
:=
[]
float64
{}
scales
:=
make
([]
float64
,
len
(
possibleCanvasSizes
))
for
i
,
pcs
:=
range
possibleCanvasSizes
{
for
_
,
pcs
:=
range
possibleCanvasSizes
{
scales
[
i
]
=
min
(
scaleHeight
:=
float64
(
pcs
.
Y
)
/
float64
(
imageSize
.
Y
)
float64
(
pcs
.
Y
)
/
float64
(
imageSize
.
Y
),
scaleWidth
:=
float64
(
pcs
.
X
)
/
float64
(
imageSize
.
X
)
float64
(
pcs
.
X
)
/
float64
(
imageSize
.
X
),
)
if
scaleWidth
>
scaleHeight
{
scales
=
append
(
scales
,
scaleHeight
)
}
else
{
scales
=
append
(
scales
,
scaleWidth
)
}
}
}
var
minUpscale
float64
var
minUpscale
float64
...
@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
...
@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
return
selectedCanvas
return
selectedCanvas
}
}
func
(
p
*
ImageProcessor
)
splitToTiles
(
img
image
.
Image
,
numTilesSize
image
.
Point
)
[]
image
.
Image
{
func
(
p
ImageProcessor
)
splitToTiles
(
img
image
.
Image
,
numTilesSize
image
.
Point
)
[]
image
.
Image
{
b
:=
img
.
Bounds
()
b
:=
img
.
Bounds
()
width
:=
b
.
Max
.
X
-
b
.
Min
.
X
width
:=
b
.
Max
.
X
-
b
.
Min
.
X
height
:=
b
.
Max
.
Y
-
b
.
Min
.
Y
height
:=
b
.
Max
.
Y
-
b
.
Min
.
Y
tileHeight
:=
height
/
numTilesSize
.
Y
tileHeight
:=
height
/
numTilesSize
.
Y
tileWidth
:=
width
/
numTilesSize
.
X
tileWidth
:=
width
/
numTilesSize
.
X
images
:=
[]
image
.
Image
{}
images
:=
make
(
[]
image
.
Image
,
0
,
numTilesSize
.
Y
*
numTilesSize
.
X
)
for
h
:=
range
numTilesSize
.
Y
{
for
h
:=
range
numTilesSize
.
Y
{
for
w
:=
range
numTilesSize
.
X
{
for
w
:=
range
numTilesSize
.
X
{
rect
:=
image
.
Rect
(
tileWidth
*
w
,
tileHeight
*
h
,
tileWidth
*
(
w
+
1
),
tileHeight
*
(
h
+
1
))
rect
:=
image
.
Rect
(
tileWidth
*
w
,
tileHeight
*
h
,
tileWidth
*
(
w
+
1
),
tileHeight
*
(
h
+
1
))
i
mages
=
append
(
images
,
img
.
(
interface
{
i
f
subImg
,
ok
:=
img
.
(
interface
{
SubImage
(
image
.
Rectangle
)
image
.
Image
SubImage
(
image
.
Rectangle
)
image
.
Image
})
.
SubImage
(
rect
))
});
ok
{
images
=
append
(
images
,
subImg
.
SubImage
(
rect
))
}
else
{
// Handle the case where img does not implement SubImage
// This is a fallback and may not be efficient
newImg
:=
image
.
NewRGBA
(
rect
)
draw
.
Draw
(
newImg
,
rect
,
img
,
rect
.
Min
,
draw
.
Src
)
images
=
append
(
images
,
newImg
)
}
}
}
}
}
return
images
return
images
}
}
// remove the "alpha" channel by drawing over a prefilled image
func
(
p
ImageProcessor
)
resize
(
img
image
.
Image
)
(
image
.
Image
,
image
.
Point
)
{
//
//nolint:unused
func
(
p
*
ImageProcessor
)
compositeImage
(
img
image
.
Image
)
image
.
Image
{
dst
:=
image
.
NewRGBA
(
img
.
Bounds
())
white
:=
color
.
RGBA
{
255
,
255
,
255
,
255
}
draw
.
Draw
(
dst
,
dst
.
Bounds
(),
&
image
.
Uniform
{
white
},
image
.
Point
{},
draw
.
Src
)
draw
.
Draw
(
dst
,
dst
.
Bounds
(),
img
,
img
.
Bounds
()
.
Min
,
draw
.
Over
)
return
dst
}
func
(
p
*
ImageProcessor
)
resize
(
img
image
.
Image
,
outputSize
image
.
Point
,
maxImageTiles
int
)
(
image
.
Image
,
image
.
Point
)
{
b
:=
img
.
Bounds
()
b
:=
img
.
Bounds
()
tileSize
:=
outputSize
.
Y
canvasSize
:=
p
.
optimalTiledCanvas
(
b
.
Max
,
maxImageTiles
,
tileSize
)
canvasSize
:=
p
.
optimalTiledCanvas
(
b
.
Max
)
aspectRatio
:=
image
.
Point
{
canvasSize
.
X
/
til
eSize
,
canvasSize
.
Y
/
til
eSize
}
aspectRatio
:=
image
.
Point
{
canvasSize
.
X
/
p
.
imag
eSize
,
canvasSize
.
Y
/
p
.
imag
eSize
}
newSize
:=
p
.
fitToCanvas
(
b
.
Max
,
canvasSize
,
tileSize
)
newSize
:=
p
.
fitToCanvas
(
b
.
Max
,
canvasSize
)
dst
:=
image
.
NewRGBA
(
image
.
Rect
(
0
,
0
,
newSize
.
X
,
newSize
.
Y
))
dst
:=
image
.
NewRGBA
(
image
.
Rect
(
0
,
0
,
newSize
.
X
,
newSize
.
Y
))
...
@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
...
@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
return
dst
,
aspectRatio
return
dst
,
aspectRatio
}
}
func
(
p
*
ImageProcessor
)
pad
(
img
image
.
Image
,
outputSize
,
aspectRatio
image
.
Point
)
image
.
Image
{
func
(
p
ImageProcessor
)
pad
(
img
image
.
Image
,
aspectRatio
image
.
Point
)
image
.
Image
{
paddedSize
:=
image
.
Point
{
paddedSize
:=
image
.
Point
{
X
:
output
Size
.
X
*
aspectRatio
.
X
,
X
:
p
.
image
Size
*
aspectRatio
.
X
,
Y
:
output
Size
.
Y
*
aspectRatio
.
Y
,
Y
:
p
.
image
Size
*
aspectRatio
.
Y
,
}
}
dst
:=
image
.
NewRGBA
(
image
.
Rect
(
0
,
0
,
paddedSize
.
X
,
paddedSize
.
Y
))
dst
:=
image
.
NewRGBA
(
image
.
Rect
(
0
,
0
,
paddedSize
.
X
,
paddedSize
.
Y
))
...
@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
...
@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
return
dst
return
dst
}
}
func
(
p
*
ImageProcessor
)
pack
(
img
image
.
Image
,
aspectRatio
image
.
Point
,
mean
,
std
[
3
]
float32
)
[]
float32
{
func
(
p
ImageProcessor
)
pack
(
img
image
.
Image
,
aspectRatio
image
.
Point
)
[]
float32
{
subImages
:=
p
.
splitToTiles
(
img
,
aspectRatio
)
subImages
:=
p
.
splitToTiles
(
img
,
aspectRatio
)
var
pixelVals
[]
float32
var
pixelVals
[]
float32
...
@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
...
@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
gVal
:=
float32
(
g
>>
8
)
/
255.0
gVal
:=
float32
(
g
>>
8
)
/
255.0
bVal
:=
float32
(
b
>>
8
)
/
255.0
bVal
:=
float32
(
b
>>
8
)
/
255.0
rVal
=
(
rVal
-
mean
[
0
])
/
std
[
0
]
rVal
=
(
rVal
-
p
.
mean
[
0
])
/
p
.
std
[
0
]
gVal
=
(
gVal
-
mean
[
1
])
/
std
[
1
]
gVal
=
(
gVal
-
p
.
mean
[
1
])
/
p
.
std
[
1
]
bVal
=
(
bVal
-
mean
[
2
])
/
std
[
2
]
bVal
=
(
bVal
-
p
.
mean
[
2
])
/
p
.
std
[
2
]
rVals
=
append
(
rVals
,
rVal
)
rVals
=
append
(
rVals
,
rVal
)
gVals
=
append
(
gVals
,
gVal
)
gVals
=
append
(
gVals
,
gVal
)
...
@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
...
@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
return
pixelVals
return
pixelVals
}
}
func
(
p
ImageProcessor
)
ProcessImage
(
img
image
.
Image
)
([]
float32
,
int
,
error
)
{
func
(
p
ImageProcessor
)
ProcessImage
(
img
image
.
Image
)
([]
float32
,
supportedAspectRatio
,
error
)
{
outputSize
:=
image
.
Point
{
p
.
imageSize
,
p
.
imageSize
}
newImage
,
newImageRatio
:=
p
.
resize
(
img
)
newImage
=
p
.
pad
(
newImage
,
newImageRatio
)
// clip values
pixelValues
:=
p
.
pack
(
newImage
,
newImageRatio
)
mean
:=
[
3
]
float32
{
0.48145466
,
0.4578275
,
0.40821073
}
std
:=
[
3
]
float32
{
0.26862954
,
0.26130258
,
0.27577711
}
newImage
,
aspectRatio
:=
p
.
resize
(
img
,
outputSize
,
p
.
maxNumTiles
)
supportedAspectRatios
:=
p
.
supportedAspectRatios
()
newImage
=
p
.
pad
(
newImage
,
outputSize
,
aspectRatio
)
aspectRatioID
:=
slices
.
IndexFunc
(
supportedAspectRatios
,
func
(
i
supportedAspectRatio
)
bool
{
return
i
.
width
==
newImageRatio
.
X
&&
i
.
height
==
newImageRatio
.
Y
})
data
:=
p
.
pack
(
newImage
,
aspectRatio
,
mean
,
std
)
return
pixelValues
,
supportedAspectRatios
[
aspectRatioID
],
nil
aspectRatioIndex
:=
slices
.
Index
(
p
.
supportedAspectRatios
(
p
.
maxNumTiles
),
aspectRatio
)
+
1
return
data
,
aspectRatioIndex
,
nil
}
}
Prev
1
…
7
8
9
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment