Commit b2b270ad authored by Devon Rifkin's avatar Devon Rifkin
Browse files

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
...@@ -2,117 +2,17 @@ package model ...@@ -2,117 +2,17 @@ package model
import ( import (
"cmp" "cmp"
"context"
"fmt"
"iter" "iter"
"log/slog" "log/slog"
"slices"
"strings" "strings"
"sync"
"github.com/dlclark/regexp2" "github.com/dlclark/regexp2"
heap "github.com/emirpasic/gods/v2/trees/binaryheap" heap "github.com/emirpasic/gods/v2/trees/binaryheap"
"github.com/ollama/ollama/logutil"
) )
// Special names a category of special token that a vocabulary can be asked
// about; Vocabulary.Is takes one of these values.
type Special int32
const (
// SpecialBOS (value 0) selects the vocabulary's BOS token id.
SpecialBOS Special = iota
// SpecialEOS (value 1) selects the end tokens; Vocabulary.Is treats both the
// EOS and the EOT id as matching it.
SpecialEOS
)
// Token type codes, numbered consecutively from 1 (NORMAL) to 6 (BYTE).
// They are compared against the entries of Vocabulary.Types; for example
// SpecialVocabulary collects every token whose type is TOKEN_TYPE_CONTROL.
const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
// TextProcessor converts between text and int32 token ids and exposes the
// vocabulary it works with.
type TextProcessor interface {
// Encode turns s into token ids. In BytePairEncoding.Encode, addSpecial
// controls whether BOS/EOS ids are added around a non-empty result; other
// implementations are presumably the same — TODO confirm.
Encode(s string, addSpecial bool) ([]int32, error)
// Decode turns a sequence of token ids back into a string.
Decode([]int32) (string, error)
// Is reports whether the given token id belongs to the given Special category.
Is(int32, Special) bool
// Vocabulary returns the vocabulary used by this processor.
Vocabulary() *Vocabulary
}
// Vocabulary holds a tokenizer's token table plus lookup caches that are
// built on first use. Because it contains sync.Once fields it must not be
// copied after first use; all methods take a pointer receiver.
type Vocabulary struct {
// Values lists the token strings; a token's id is its index in this slice.
Values []string
// Types holds one token type code per id (see the TOKEN_TYPE_* constants).
Types []int32
// Scores is not read by the methods in this file; presumably one score
// per token, used by score-based tokenizers — TODO confirm.
Scores []float32
// Merges lists merge rules as "left right" strings; a rule's index is the
// value returned by Merge.
Merges []string
// BOS, EOS and EOT are the ids of the special tokens checked by Is.
BOS, EOS, EOT int32
// AddBOS and AddEOS make BytePairEncoding.Encode prepend BOS / append EOS
// when special tokens are requested. AddEOT is not read in the code
// visible here — presumably the analogous switch for EOT.
AddBOS, AddEOS, AddEOT bool
// special caches the result of SpecialVocabulary; specialOnce guards its
// one-time construction.
specialOnce sync.Once
special []string
// values maps token string -> id for Encode; valuesOnce guards its
// one-time construction.
valuesOnce sync.Once
values map[string]int32
// merge maps "left right" -> index in Merges for Merge; mergeOnce guards
// its one-time construction.
mergeOnce sync.Once
merge map[string]int32
}
// Is reports whether id is the vocabulary's token for the requested special
// category: the BOS id for SpecialBOS, and either the EOS or the EOT id for
// SpecialEOS. Any other category yields false.
func (v *Vocabulary) Is(id int32, special Special) bool {
	if special == SpecialBOS {
		return id == v.BOS
	}

	if special == SpecialEOS {
		return id == v.EOS || id == v.EOT
	}

	return false
}
// Encode returns the id of the token whose text is exactly s, or -1 when no
// such token exists. The string-to-id index is built from Values on the
// first call and reused afterwards; if Values contains duplicates, the
// highest index wins.
func (v *Vocabulary) Encode(s string) int32 {
	v.valuesOnce.Do(func() {
		index := make(map[string]int32, len(v.Values))
		for id := range v.Values {
			index[v.Values[id]] = int32(id)
		}
		v.values = index
	})

	id, found := v.values[s]
	if !found {
		return -1
	}
	return id
}
// Decode returns the token text stored at index id in Values. The id is not
// range-checked, so an id outside Values panics.
func (v *Vocabulary) Decode(id int32) string {
	token := v.Values[id]
	return token
}
// SpecialVocabulary returns the text of every special token, in id order.
// A token counts as special when its id is 105 or 106 (hard-coded ids), or
// when its entry in Types is TOKEN_TYPE_CONTROL. The list is computed on the
// first call and cached.
func (v *Vocabulary) SpecialVocabulary() []string {
	v.specialOnce.Do(func() {
		for id, token := range v.Values {
			// The id checks come first so Types is not consulted for 105/106.
			if id == 105 || id == 106 || v.Types[id] == TOKEN_TYPE_CONTROL {
				v.special = append(v.special, token)
			}
		}
	})

	return v.special
}
// Merge returns the index in Merges of the rule "left right" (the two parts
// joined by a single space), or -1 when there is no such rule. The lookup
// table is built from Merges on the first call and reused afterwards.
func (v *Vocabulary) Merge(left, right string) int {
	v.mergeOnce.Do(func() {
		ranks := make(map[string]int32, len(v.Merges))
		for rank := range v.Merges {
			ranks[v.Merges[rank]] = int32(rank)
		}
		v.merge = ranks
	})

	rank, found := v.merge[left+" "+right]
	if !found {
		return -1
	}
	return int(rank)
}
type BytePairEncoding struct { type BytePairEncoding struct {
pre *regexp2.Regexp pre *regexp2.Regexp
vocab *Vocabulary vocab *Vocabulary
...@@ -302,29 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) { ...@@ -302,29 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
} }
} }
if addSpecial && len(ids) > 0 { slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if bpe.vocab.AddBOS {
if ids[0] == bpe.vocab.BOS {
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
}
slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
ids = append([]int32{bpe.vocab.BOS}, ids...)
}
if bpe.vocab.AddEOS { if addSpecial && len(ids) > 0 {
if ids[len(ids)-1] == bpe.vocab.EOS { ids = bpe.vocab.addSpecials(ids)
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
}
slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
ids = append(ids, bpe.vocab.EOS)
}
} }
return ids, nil return ids, nil
} }
// lazyIdsString wraps a slice of token ids so it can be passed as a log
// attribute; the string form is produced only when LogValue is called.
type lazyIdsString struct {
	ids []int32
}

// LogValue implements slog.LogValuer. It renders the ids with fmt.Sprint
// (for example "[1 2 3]") and returns them as a string-kind value.
func (l lazyIdsString) LogValue() slog.Value {
	rendered := fmt.Sprint(l.ids)
	return slog.StringValue(rendered)
}
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder var sb strings.Builder
for _, id := range ids { for _, id := range ids {
...@@ -349,5 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) { ...@@ -349,5 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
} }
} }
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
return sb.String(), nil return sb.String(), nil
} }
...@@ -2,16 +2,30 @@ package input ...@@ -2,16 +2,30 @@ package input
import "github.com/ollama/ollama/ml" import "github.com/ollama/ollama/ml"
// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type Multimodal struct {
// Tensor is the embedding data. Implementations may choose what to
// store here, or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor ml.Tensor
// Data is implementation-specific opaque data, such as metadata on how
// to lay out Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data any
}
// Input represents one token in the input stream // Input represents one token in the input stream
type Input struct { type Input struct {
// Token is a single element of text. // Token is a single element of text.
Token int32 Token int32
// Multimodal is opaque data representing a non-text // Multimodal represents a non-text element such as an
// element such as an image (or part of one if the image // image (or part of one if the image can be processed in pieces).
// can be processed in pieces). It may be either together // It may be used either together with Token or on its own.
// with Token or on its own. Multimodal []Multimodal
Multimodal any
// MultimodalHash is a unique representation of the data // MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing // stored in Multimodal, used for caching and comparing
...@@ -32,7 +46,7 @@ type Input struct { ...@@ -32,7 +46,7 @@ type Input struct {
// Positions slice. // Positions slice.
type MultimodalIndex struct { type MultimodalIndex struct {
Index int Index int
Multimodal any Multimodal []Multimodal
} }
// Batch contains the inputs for a model forward pass // Batch contains the inputs for a model forward pass
......
...@@ -19,6 +19,7 @@ import ( ...@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
fsggml "github.com/ollama/ollama/fs/ggml" fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
_ "github.com/ollama/ollama/ml/backend" _ "github.com/ollama/ollama/ml/backend"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
...@@ -39,12 +40,13 @@ type MultimodalProcessor interface { ...@@ -39,12 +40,13 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and // EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model. // generates an output (typically an embedding) that can be used by the model.
// //
// The return value is most typically an ml.Tensor, however, different // The return value is one or more tensors, each with optional model-specific
// type are possible, such as an object containing a tensor plus // opaque metadata. Typically, the tensors might be views into an embedding
// additional metadata, a slice of tensors or even just the original input. // with each view representing a chunk of data that can be processed independently
// in different batches.
// //
// The result may be cached by the runner. // The result may be cached by the runner.
EncodeMultimodal(ml.Context, []byte) (any, error) EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
// PostTokenize is called after tokenization to allow the model to edit the // PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements. // input stream to correctly arrange multimodal elements.
...@@ -96,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) { ...@@ -96,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
} }
// New initializes a new model instance with the provided configuration based on the metadata in the model file // New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) { func New(modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath) b, err := ml.NewBackend(modelPath, params)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(ctx, r, params)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -132,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) { ...@@ -132,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
return nil, err return nil, err
} }
defer r.Close() defer r.Close()
meta, _, err := fsggml.Decode(r, -1) meta, err := fsggml.Decode(r, -1)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -202,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value { ...@@ -202,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
names := fn(tagsCopy) names := fn(tagsCopy)
for _, name := range names { for _, name := range names {
if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil { if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
slog.Debug("found tensor", "", tensor) slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
vv.Set(reflect.ValueOf(tensor)) vv.Set(reflect.ValueOf(tensor))
break break
} }
...@@ -291,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten ...@@ -291,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
return nil, errors.New("batch size cannot be less than 1") return nil, errors.New("batch size cannot be less than 1")
} }
var err error batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
cache := m.Config().Cache cache := m.Config().Cache
if cache != nil { if cache != nil {
......
...@@ -7,6 +7,8 @@ import ( ...@@ -7,6 +7,8 @@ import (
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
) )
...@@ -43,8 +45,13 @@ func New(c fs.Config) (model.Model, error) { ...@@ -43,8 +45,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"), Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"), Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"), Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
}, },
), ),
Layers: make([]Layer, c.Uint("block_count")), Layers: make([]Layer, c.Uint("block_count")),
...@@ -78,11 +85,10 @@ type SelfAttention struct { ...@@ -78,11 +85,10 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
q := sa.Query.Forward(ctx, hiddenState) q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize) q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale) q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
if opts.largeModelScaling { if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads))) q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
...@@ -92,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten ...@@ -92,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
k := sa.Key.Forward(ctx, hiddenState) k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize) k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale) k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
v := sa.Value.Forward(ctx, hiddenState) v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize) v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
...@@ -122,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten ...@@ -122,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
} }
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
} }
type MLP struct { type MLP struct {
...@@ -169,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten ...@@ -169,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil { outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize))) hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
......
...@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) { ...@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"), Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"), Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"), Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(1), BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOT: int32(106), EOS: append(
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false), []int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
}, },
), ),
ImageProcessor: newImageProcessor(c), ImageProcessor: newImageProcessor(c),
...@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) { ...@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil return &m, nil
} }
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 { if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel return nil, model.ErrNoVisionModel
} }
...@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er ...@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err return nil, err
} }
pixelValues, err := ctx.Input().FromFloatSlice(f32s, pixelValues := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize, m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize, m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels, m.ImageProcessor.numChannels,
) )
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps) visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
return visionOutputs, nil return []input.Multimodal{{Tensor: visionOutputs}}, nil
} }
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input var result []input.Input
for _, inp := range inputs { for _, inp := range inputs {
if inp.Multimodal == nil { if len(inp.Multimodal) == 0 {
result = append(result, inp) result = append(result, inp)
} else { } else {
inputMultimodal := inp.Multimodal.(ml.Tensor) inputMultimodal := inp.Multimodal[0].Tensor
result = append(result, result = append(result,
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n" input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"" input.Input{Token: 255999}, // "<start_of_image>""
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
) )
// add image token placeholders // add image token placeholders
...@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { ...@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil { outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
} }
......
...@@ -7,7 +7,8 @@ import ( ...@@ -7,7 +7,8 @@ import (
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
) )
...@@ -20,9 +21,6 @@ type TextConfig struct { ...@@ -20,9 +21,6 @@ type TextConfig struct {
} }
type TextModel struct { type TextModel struct {
model.Base
model.SentencePieceModel
TokenEmbedding *nn.Embedding `gguf:"token_embd"` TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []TextLayer `gguf:"blk"` Layers []TextLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"` OutputNorm *nn.RMSNorm `gguf:"output_norm"`
...@@ -45,15 +43,6 @@ func newTextModel(c fs.Config) *TextModel { ...@@ -45,15 +43,6 @@ func newTextModel(c fs.Config) *TextModel {
numBlocks := int(c.Uint("block_count")) numBlocks := int(c.Uint("block_count"))
m := TextModel{ m := TextModel{
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]TextLayer, numBlocks), Layers: make([]TextLayer, numBlocks),
TextConfig: &TextConfig{ TextConfig: &TextConfig{
hiddenSize: int(c.Uint("embedding_length")), hiddenSize: int(c.Uint("embedding_length")),
...@@ -86,7 +75,6 @@ type TextSelfAttention struct { ...@@ -86,7 +75,6 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor { func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
ropeBase := opts.ropeLocalBase ropeBase := opts.ropeLocalBase
if (layer+1)%gemmaGlobalCacheCount == 0 { if (layer+1)%gemmaGlobalCacheCount == 0 {
...@@ -96,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos ...@@ -96,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
q := sa.Query.Forward(ctx, hiddenState) q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize) q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = sa.QueryNorm.Forward(ctx, q, opts.eps) q = sa.QueryNorm.Forward(ctx, q, opts.eps)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale) q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
if opts.largeModelScaling { if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads))) q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
...@@ -107,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos ...@@ -107,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
k := sa.Key.Forward(ctx, hiddenState) k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize) k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = sa.KeyNorm.Forward(ctx, k, opts.eps) k = sa.KeyNorm.Forward(ctx, k, opts.eps)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale) k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
v := sa.Value.Forward(ctx, hiddenState) v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize) v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
...@@ -125,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T ...@@ -125,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
ropeBase = m.TextConfig.ropeGlobalBase ropeBase = m.TextConfig.ropeGlobalBase
} }
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
} }
type TextMLP struct { type TextMLP struct {
...@@ -178,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -178,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// set image embeddings // set image embeddings
var except []int var except []int
for _, image := range batch.Multimodal { for _, image := range batch.Multimodal {
visionOutputs := image.Multimodal.(ml.Tensor) visionOutputs := image.Multimodal[0].Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1)))) ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
for i := range visionOutputs.Dim(1) { for i := range visionOutputs.Dim(1) {
......
package llama package llama
import ( import (
"fmt" "cmp"
"math" "math"
"strings"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
) )
type Options struct { type Options struct {
hiddenSize, numHeads, numKVHeads int hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32 eps, ropeBase, ropeScale float32
ropeDim uint32
} }
type Model struct { type Model struct {
...@@ -32,10 +33,6 @@ type Model struct { ...@@ -32,10 +33,6 @@ type Model struct {
} }
func New(c fs.Config) (model.Model, error) { func New(c fs.Config) (model.Model, error) {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
m := Model{ m := Model{
BytePairEncoding: model.NewBytePairEncoding( BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
...@@ -43,10 +40,13 @@ func New(c fs.Config) (model.Model, error) { ...@@ -43,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"), Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"), Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"), Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
}, },
), ),
Layers: make([]Layer, c.Uint("block_count")), Layers: make([]Layer, c.Uint("block_count")),
...@@ -54,10 +54,11 @@ func New(c fs.Config) (model.Model, error) { ...@@ -54,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
hiddenSize: int(c.Uint("embedding_length")), hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")), numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")), numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"), eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"), ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1), ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
}, },
} }
...@@ -74,31 +75,31 @@ type SelfAttention struct { ...@@ -74,31 +75,31 @@ type SelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
} }
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeType := uint32(0) ropeDim := cmp.Or(opts.ropeDim, headDim)
q := sa.Query.Forward(ctx, hiddenState) query := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState) key := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState) value := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
scaleFactor := 1.0 / math.Sqrt(float64(headDim)) query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache) key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, kqv) attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
return sa.Output.Forward(ctx, attention)
} }
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
} }
type MLP struct { type MLP struct {
...@@ -119,11 +120,11 @@ type Layer struct { ...@@ -119,11 +120,11 @@ type Layer struct {
MLP *MLP MLP *MLP
} }
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor { func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenState residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts) hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions // In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for. // we need logits for.
...@@ -141,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten ...@@ -141,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs) hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers { for i, layer := range m.Layers {
m.Cache.SetLayer(i) m.Cache.SetLayer(i)
var lastLayerOutputs ml.Tensor var outputs ml.Tensor
if i == len(m.Layers)-1 { if i == len(m.Layers)-1 {
lastLayerOutputs = outputs outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
} }
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options) hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
} }
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
......
...@@ -4,7 +4,6 @@ import ( ...@@ -4,7 +4,6 @@ import (
"bytes" "bytes"
"image" "image"
"slices" "slices"
"sync"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
...@@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) { ...@@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"), Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"), Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"), Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
}, },
), ),
ImageProcessor: newImageProcessor(c), ImageProcessor: newImageProcessor(c),
...@@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) { ...@@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil return &m, nil
} }
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) < 1 { if len(m.VisionModel.Layers) < 1 {
return nil, model.ErrNoVisionModel return nil, model.ErrNoVisionModel
} }
...@@ -75,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er ...@@ -75,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err return nil, err
} }
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels) tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
if err != nil {
return nil, err
}
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
...@@ -89,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er ...@@ -89,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
pixelValues := tilesLocal pixelValues := tilesLocal
if len(pixelsGlobal) > 0 { if len(pixelsGlobal) > 0 {
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels) tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
if err != nil {
return nil, err
}
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3) pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
} }
visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3)) visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
projectedOutputs := m.Projector.Forward(ctx, visionOutputs) projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
}
type chunks struct { var multimodal []input.Multimodal
*Model aspectRatio := image.Point{ratioW, ratioH}
ml.Tensor
aspectRatio image.Point var offset int
patchesPerChunk := projectedOutputs.Dim(1)
if aspectRatio.Y*aspectRatio.X > 1 {
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
for range aspectRatio.Y {
for x := range aspectRatio.X {
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
var separator separator
if x < aspectRatio.X-1 {
separator.x = true // <|tile_x_separator|>
} else {
separator.y = true // <|tile_y_separator|>
}
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
offset += patchesPerChunk
}
}
}
dataOnce sync.Once view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
data []float32 projectedOutputs.Dim(0), projectedOutputs.Stride(1),
} patchesPerChunk)
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
type chunk struct { return multimodal, nil
*chunks
s, n int
} }
func (r *chunk) floats() []float32 { type separator struct {
r.dataOnce.Do(func() { x bool
temp := r.Backend().NewContext() y bool
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
} }
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input var result []input.Input
for _, inp := range inputs { for _, inp := range inputs {
if inp.Multimodal == nil { if len(inp.Multimodal) == 0 {
result = append(result, inp) result = append(result, inp)
continue continue
} }
t := inp.Multimodal.(*chunks)
var imageInputs []input.Input var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|> imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
var offset int for i, mm := range inp.Multimodal {
patchesPerChunk := t.Dim(1) patchesPerChunk := mm.Tensor.Dim(1)
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
for range t.aspectRatio.Y {
for x := range t.aspectRatio.X {
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if x < t.aspectRatio.X-1 {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
offset += patchesPerChunk
}
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|> if i < len(inp.Multimodal)-1 {
separator := mm.Data.(*separator)
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if separator.x {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
if separator.y {
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
} else {
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
} }
} }
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
result = append(result, imageInputs...) result = append(result, imageInputs...)
} }
...@@ -171,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { ...@@ -171,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil { outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
} }
......
...@@ -8,6 +8,8 @@ import ( ...@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
) )
...@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent ...@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
if useRope { if useRope {
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale) key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
} }
if opts.useQKNorm { if opts.useQKNorm {
...@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp ...@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
} }
type TextExperts struct { type TextExperts struct {
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"` Gate *nn.Linear `gguf:"ffn_gate_exps"`
Up ml.Tensor `gguf:"ffn_up_exps.weight"` Up *nn.Linear `gguf:"ffn_up_exps"`
Down ml.Tensor `gguf:"ffn_down_exps.weight"` Down *nn.Linear `gguf:"ffn_down_exps"`
} }
func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor { func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
...@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens ...@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed) hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
hiddenStates = hiddenStates.Mul(ctx, scores) hiddenStates = hiddenStates.Mul(ctx, scores)
upStates := e.Up.MulmatID(ctx, hiddenStates, experts) upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts) gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts) downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)) nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ { for i := 1; i < opts.numExpertsUsed; i++ {
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))) nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
} }
return nextStates return nextStates
...@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx) hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal { for _, mi := range batch.Multimodal {
f32s := mi.Multimodal.(*chunk).floats() img := mi.Multimodal[0].Tensor
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1)))) ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
} }
...@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0) scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
} }
var err error attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
if err != nil {
panic(err)
}
} }
for i, layer := range m.Layers { for i, layer := range m.Layers {
...@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
} }
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
} }
...@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { ...@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
} }
hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps) hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0) hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions) hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates return hiddenStates
} }
...@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) { ...@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
} }
} }
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2) ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
if err != nil {
panic(err)
}
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx) ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches) ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
......
...@@ -4,7 +4,6 @@ import ( ...@@ -4,7 +4,6 @@ import (
"bytes" "bytes"
"image" "image"
"slices" "slices"
"sync"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
...@@ -16,6 +15,8 @@ import ( ...@@ -16,6 +15,8 @@ import (
type Model struct { type Model struct {
model.Base model.Base
model.BytePairEncoding
*TextModel *TextModel
*VisionModel `gguf:"v,vision"` *VisionModel `gguf:"v,vision"`
*MultiModalProjector `gguf:"mm"` *MultiModalProjector `gguf:"mm"`
...@@ -30,13 +31,23 @@ var _ model.MultimodalProcessor = (*Model)(nil) ...@@ -30,13 +31,23 @@ var _ model.MultimodalProcessor = (*Model)(nil)
var _ model.TextProcessor = (*Model)(nil) var _ model.TextProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) { func New(c fs.Config) (model.Model, error) {
textModel, err := NewTextModel(c)
if err != nil {
return nil, err
}
m := &Model{ m := &Model{
TextModel: textModel, BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c), VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c), ImageProcessor: newImageProcessor(c),
MultiModalProjector: newMultiModalProjector(c), MultiModalProjector: newMultiModalProjector(c),
...@@ -88,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector { ...@@ -88,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
} }
} }
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 { if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel return nil, model.ErrNoVisionModel
} }
...@@ -103,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er ...@@ -103,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err return nil, err
} }
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels) pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues) visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size) features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
// split into patches to be sent to the text transformer // split into patches to be sent to the text transformer
parent := imageFeatures{tensor: features} rows := make([]input.Multimodal, size.Y)
rows := make([]*imageRow, size.Y)
for i := range rows { for i := range rows {
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}} rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
} }
return rows, nil return rows, nil
} }
type imageFeatures struct {
tensor ml.Tensor
dataOnce sync.Once
data []float32
}
type imageRow struct {
parent *imageFeatures
s int
shape []int
}
func (r *imageRow) data() []float32 {
n := 1
for _, s := range r.shape {
n *= s
}
return r.parent.data[r.s*n : (r.s+1)*n]
}
// PostTokenize arranges Mistral 3's inputs for the forward pass // PostTokenize arranges Mistral 3's inputs for the forward pass
// In Mistral 3 and Pixtral, the input patches are arranged as follows: // In Mistral 3 and Pixtral, the input patches are arranged as follows:
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END] // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
...@@ -151,15 +136,14 @@ func (r *imageRow) data() []float32 { ...@@ -151,15 +136,14 @@ func (r *imageRow) data() []float32 {
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input var result []input.Input
for _, inp := range inputs { for _, inp := range inputs {
if inp.Multimodal == nil { if len(inp.Multimodal) == 0 {
result = append(result, inp) result = append(result, inp)
} else { } else {
inputMultimodal := inp.Multimodal.([]*imageRow) for i, row := range inp.Multimodal {
for i, row := range inputMultimodal {
// [IMG] // [IMG]
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]}) result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...) result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
if i == len(inputMultimodal)-1 { if i == len(inp.Multimodal)-1 {
// [IMG_END] // [IMG_END]
result = append(result, input.Input{Token: 13}) result = append(result, input.Input{Token: 13})
} else { } else {
...@@ -174,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { ...@@ -174,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil { outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
} }
......
package mistral3 package mistral3
import ( import (
"fmt" "cmp"
"math" "math"
"strings"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model" "github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/model/input" "github.com/ollama/ollama/model/input"
) )
type TextOptions struct { type TextOptions struct {
hiddenSize, numHeads, numKVHeads, headDim int hiddenSize, numHeads, numKVHeads int
eps, ropeBase, ropeScale float32 headDim, ropeDim int
ropeDim uint32 eps, ropeBase, ropeScale float32
} }
type TextModel struct { type TextModel struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"` TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"` Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"` OutputNorm *nn.RMSNorm `gguf:"output_norm"`
...@@ -40,19 +36,15 @@ type SelfAttention struct { ...@@ -40,19 +36,15 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor { func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
ropeType := uint32(0) headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
headDim := opts.headDim
if headDim == 0 {
headDim = opts.hiddenSize / opts.numHeads
}
q := sa.Query.Forward(ctx, hiddenState) q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize) q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState) k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize) k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState) v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize) v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
...@@ -63,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten ...@@ -63,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
} }
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
} }
type MLP struct { type MLP struct {
...@@ -110,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -110,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// image embeddings // image embeddings
for _, image := range batch.Multimodal { for _, image := range batch.Multimodal {
row := image.Multimodal.(*imageRow) imageFeature := image.Multimodal[0].Tensor
row.parent.dataOnce.Do(func() {
// use a new, throwaway context so the image tensor is not added to the graph
temp := m.Backend().NewContext()
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
row.parent.data = row.parent.tensor.Floats()
temp.Close()
})
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
if err != nil {
panic(err)
}
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1)))) ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
} }
...@@ -142,36 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor ...@@ -142,36 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
return m.Output.Forward(ctx, hiddenState) return m.Output.Forward(ctx, hiddenState)
} }
func NewTextModel(c fs.Config) (*TextModel, error) { func newTextModel(c fs.Config) *TextModel {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") { return &TextModel{
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
textModel := &TextModel{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
Layers: make([]Layer, c.Uint("block_count")), Layers: make([]Layer, c.Uint("block_count")),
TextOptions: &TextOptions{ TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")), hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")), numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")), numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")), headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"), eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"), ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1), ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
}, },
} }
return textModel, nil
} }
...@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor) ...@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
} }
} }
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2) h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
if err != nil { w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
panic(err)
}
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
if err != nil {
panic(err)
}
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
...@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { ...@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
} }
} }
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions)) positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
if err != nil {
panic(err)
}
positionEmbedding := m.positionalEmbedding(ctx, positionIDs) positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx) cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
...@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor { ...@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
func newVisionModel(c fs.Config) *VisionModel { func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{ return &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)), Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
VisionModelOptions: &VisionModelOptions{ VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1024)), hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
numHeads: int(c.Uint("vision.attention.head_count", 16)), numHeads: int(c.Uint("vision.attention.head_count", 16)),
......
package mllama
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/model/imageproc"
)
// getSupportedAspectRatios lists every tile arrangement whose total tile
// count (X * Y) does not exceed maxTiles. Both components start at 1; results
// are ordered by X first and then by Y. A non-positive maxTiles yields an
// empty, non-nil slice.
func getSupportedAspectRatios(maxTiles int) []image.Point {
	arrangements := []image.Point{}
	for across := 1; across <= maxTiles; across++ {
		for down := 1; down <= maxTiles; down++ {
			if across*down > maxTiles {
				continue
			}
			arrangements = append(arrangements, image.Point{X: across, Y: down})
		}
	}
	return arrangements
}
// clip limits a to the range [a_min, a_max]. The lower bound is tested first,
// so when a is below a_min the result is a_min even if the bounds are inverted.
func clip(a, a_min, a_max int) int {
	switch {
	case a < a_min:
		return a_min
	case a > a_max:
		return a_max
	default:
		return a
	}
}
// getOptimalTiledCanvas picks the canvas (a whole number of tileSize tiles in
// each direction, at most maxImageTiles tiles in total) that best fits an
// image of imageSize without distorting it.
//
// For every supported arrangement the limiting scale factor (the smaller of
// the width and height ratios) is computed. If at least one canvas can hold
// the image at its current size or larger (scale >= 1), the smallest such
// scale is selected; otherwise the largest downscale is selected. When several
// canvases share the selected scale, the one with the smallest area wins, with
// the earliest arrangement winning ties.
//
// A scale of exactly 1 counts as "fits": an image that already matches a
// canvas must not be assigned a larger one just because a bigger arrangement
// happens to scale above 1.
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
	arrangements := getSupportedAspectRatios(maxImageTiles)

	canvases := make([]image.Point, 0, len(arrangements))
	scales := make([]float64, 0, len(arrangements))
	for _, arrangement := range arrangements {
		canvas := image.Point{X: arrangement.X * tileSize, Y: arrangement.Y * tileSize}
		canvases = append(canvases, canvas)

		// The limiting side determines the scale that avoids distortion.
		scaleWidth := float64(canvas.X) / float64(imageSize.X)
		scaleHeight := float64(canvas.Y) / float64(imageSize.Y)
		if scaleWidth > scaleHeight {
			scales = append(scales, scaleHeight)
		} else {
			scales = append(scales, scaleWidth)
		}
	}

	var minUpscale, maxDownscale float64
	var fits bool
	for _, s := range scales {
		if s >= 1.0 {
			if !fits {
				minUpscale = s
			} else {
				minUpscale = math.Min(minUpscale, s)
			}
			fits = true
		} else {
			maxDownscale = math.Max(maxDownscale, s)
		}
	}

	selectedScale := maxDownscale
	if fits {
		selectedScale = minUpscale
	}

	// Among canvases sharing the selected scale, keep the smallest area to
	// minimise padding.
	var selectedCanvas image.Point
	var found bool
	for i, canvas := range canvases {
		if scales[i] != selectedScale {
			continue
		}
		if !found || canvas.X*canvas.Y < selectedCanvas.X*selectedCanvas.Y {
			selectedCanvas = canvas
			found = true
		}
	}
	return selectedCanvas
}
// getImageSizeFitToCanvas returns the size an image of imageSize should be
// resized to so that it fits inside canvasSize while keeping its aspect
// ratio. Each target dimension is the image dimension clipped to
// [tileSize, canvas dimension]; the smaller of the two resulting scale
// factors drives the resize, and the other dimension is floored and capped at
// its target.
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	maxW := clip(imageSize.X, tileSize, canvasSize.X)
	maxH := clip(imageSize.Y, tileSize, canvasSize.Y)

	ratioW := float64(maxW) / float64(imageSize.X)
	ratioH := float64(maxH) / float64(imageSize.Y)

	if ratioW < ratioH {
		// Width is the limiting side; derive the height from it.
		scaledH := int(math.Floor(float64(imageSize.Y) * ratioW))
		return image.Point{X: maxW, Y: min(scaledH, maxH)}
	}

	// Height is the limiting side (or both scale equally).
	scaledW := int(math.Floor(float64(imageSize.X) * ratioH))
	return image.Point{X: min(scaledW, maxW), Y: maxH}
}
// resizeImage scales img to fit the best tiled canvas for it and reports the
// tile arrangement of that canvas.
//
// PNG input is first passed through imageproc.Composite (presumably to
// flatten transparency; confirm against imageproc). The tile edge length is
// taken from outputSize.Y, so outputSize is assumed to be square.
//
// It returns the resized image and the arrangement as image.Point{X, Y},
// measured in tiles.
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	// Use the extent of the bounds, not Bounds().Max: the two only coincide
	// when the image origin is (0, 0), which sub-images do not guarantee.
	size := img.Bounds().Size()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(size, maxImageTiles, tileSize)
	aspectRatio := image.Point{X: canvasSize.X / tileSize, Y: canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(size, canvasSize, tileSize)

	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}
// padImage places img in the top-left corner of a new RGBA canvas measuring
// outputSize multiplied component-wise by aspectRatio. The rest of the canvas
// keeps the zero value of image.RGBA; any part of img that does not fit is
// cut off.
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}
	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	// Anchor the copy on the source's own origin and write it at the canvas
	// origin, so images whose bounds do not start at (0, 0) are still copied
	// in full.
	src := img.Bounds()
	draw.Draw(dst, image.Rect(0, 0, src.Dx(), src.Dy()), img, src.Min, draw.Over)

	return dst
}
// splitToTiles cuts img into numTilesSize.X by numTilesSize.Y equally sized
// tiles and returns them row by row (left to right, then top to bottom).
//
// Tile dimensions use integer division, so any remainder on the right or
// bottom edge is not covered. The tiles are views produced by the image's
// SubImage method; img must provide one (as the standard library's concrete
// image types do) or the function panics. A non-positive tile count in either
// direction yields no tiles.
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	if numTilesSize.X <= 0 || numTilesSize.Y <= 0 {
		// Avoids an integer divide-by-zero below.
		return nil
	}

	b := img.Bounds()
	tileWidth := b.Dx() / numTilesSize.X
	tileHeight := b.Dy() / numTilesSize.Y

	source := img.(interface {
		SubImage(image.Rectangle) image.Image
	})

	tiles := make([]image.Image, 0, numTilesSize.X*numTilesSize.Y)
	for row := 0; row < numTilesSize.Y; row++ {
		for col := 0; col < numTilesSize.X; col++ {
			// Offset by b.Min so images whose bounds do not start at the
			// origin are tiled over their actual pixels.
			x0 := b.Min.X + tileWidth*col
			y0 := b.Min.Y + tileHeight*row
			tiles = append(tiles, source.SubImage(image.Rect(x0, y0, x0+tileWidth, y0+tileHeight)))
		}
	}

	return tiles
}
// packImages splits img into the tiles described by aspectRatio, runs each
// tile through imageproc.Normalize with the CLIP default mean and standard
// deviation, and concatenates the results in tile order into one slice.
// Both the rescale and channel-first arguments of Normalize are set to true.
func packImages(img image.Image, aspectRatio image.Point) []float32 {
	const (
		rescale      = true
		channelFirst = true
	)

	var packed []float32
	for _, tile := range splitToTiles(img, aspectRatio) {
		normalized := imageproc.Normalize(tile, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
		packed = append(packed, normalized...)
	}
	return packed
}
// Preprocess decodes an image from imageData and converts it into the packed
// pixel values for a tiled canvas of up to four 560x560 tiles.
//
// The image is resized to fit the chosen canvas, padded to the full canvas
// size, split into tiles and normalized. The returned options map holds
// "aspectRatioIndex": the 1-based position of the chosen tile arrangement in
// the list produced by getSupportedAspectRatios.
//
// An error is returned only when the image cannot be decoded.
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	const maxTiles = 4
	tile := image.Point{X: 560, Y: 560}

	decoded, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	resized, ratio := resizeImage(decoded, format, tile, maxTiles)
	padded := padImage(resized, tile, ratio)
	pixels := packImages(padded, ratio)

	ratioIndex := slices.Index(getSupportedAspectRatios(maxTiles), ratio) + 1

	return pixels, map[string]any{"aspectRatioIndex": ratioIndex}, nil
}
package mllama
import (
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
)
// TestAspectRatios checks the arrangements returned by
// getSupportedAspectRatios for tile budgets of one through four.
func TestAspectRatios(t *testing.T) {
	tests := []struct {
		MaxTiles int
		Expected []image.Point
	}{
		{1, []image.Point{{1, 1}}},
		{2, []image.Point{{1, 1}, {1, 2}, {2, 1}}},
		{3, []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}}},
		{4, []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}}},
	}

	for _, tc := range tests {
		got := getSupportedAspectRatios(tc.MaxTiles)
		diff := cmp.Diff(got, tc.Expected)
		if diff == "" {
			continue
		}
		t.Errorf("mismatch (-got +want):\n%s", diff)
	}
}
// TestGetImageSizeFitToCanvas checks the resize target computed for a range
// of image, canvas and tile sizes.
func TestGetImageSizeFitToCanvas(t *testing.T) {
	tests := []struct {
		ImageRect  image.Point
		CanvasRect image.Point
		TileSize   int
		Expected   image.Point
	}{
		{image.Pt(400, 400), image.Pt(640, 480), 200, image.Pt(400, 400)},
		{image.Pt(1024, 768), image.Pt(640, 480), 200, image.Pt(640, 480)},
		{image.Pt(500, 500), image.Pt(1000, 1000), 750, image.Pt(750, 750)},
		{image.Pt(500, 1000), image.Pt(2000, 2000), 2000, image.Pt(1000, 2000)},
		{image.Pt(4000, 3000), image.Pt(2000, 1000), 1000, image.Pt(1333, 1000)},
		{image.Pt(667, 1000), image.Pt(1000, 1000), 560, image.Pt(667, 1000)},
	}

	for _, tc := range tests {
		got := getImageSizeFitToCanvas(tc.ImageRect, tc.CanvasRect, tc.TileSize)
		if got == tc.Expected {
			continue
		}
		t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", got, tc.Expected)
	}
}
// TestGetOptimalTiledCanvas checks the canvas chosen for landscape, portrait,
// very elongated and very large images.
func TestGetOptimalTiledCanvas(t *testing.T) {
	tests := []struct {
		ImageSize     image.Point
		MaxImageTiles int
		TileSize      int
		Expected      image.Point
	}{
		{image.Pt(1024, 768), 4, 1000, image.Pt(2000, 1000)},
		{image.Pt(1024, 768), 4, 560, image.Pt(1120, 1120)},
		{image.Pt(800, 600), 4, 560, image.Pt(1120, 1120)},
		{image.Pt(640, 480), 4, 560, image.Pt(1120, 560)},
		{image.Pt(320, 200), 4, 560, image.Pt(560, 560)},
		{image.Pt(1320, 200), 4, 560, image.Pt(1680, 560)},
		{image.Pt(2000, 200), 4, 560, image.Pt(2240, 560)},
		{image.Pt(10000, 200), 4, 560, image.Pt(2240, 560)},
		{image.Pt(480, 640), 4, 560, image.Pt(560, 1120)},
		{image.Pt(200, 320), 4, 560, image.Pt(560, 560)},
		{image.Pt(200, 1320), 4, 560, image.Pt(560, 1680)},
		{image.Pt(200, 2000), 4, 560, image.Pt(560, 2240)},
		{image.Pt(200, 10000), 4, 560, image.Pt(560, 2240)},
		{image.Pt(10000, 10000), 4, 560, image.Pt(1120, 1120)},
	}

	for _, tc := range tests {
		got := getOptimalTiledCanvas(tc.ImageSize, tc.MaxImageTiles, tc.TileSize)
		if got == tc.Expected {
			continue
		}
		t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", got, tc.Expected)
	}
}
// TestSplitToTiles checks the number of tiles and the bounds of each tile
// produced by splitToTiles.
func TestSplitToTiles(t *testing.T) {
	type splitCase struct {
		TestImage    image.Image
		NumTilesSize image.Point
		Expected     []image.Image
	}

	cases := []splitCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			NumTilesSize: image.Point{1, 1},
			Expected:     []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 500)),
			NumTilesSize: image.Point{2, 1},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
			},
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
			NumTilesSize: image.Point{2, 2},
			Expected: []image.Image{
				image.NewRGBA(image.Rect(0, 0, 500, 500)),
				image.NewRGBA(image.Rect(500, 0, 1000, 500)),
				image.NewRGBA(image.Rect(0, 500, 500, 1000)),
				image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
			},
		},
	}

	for _, c := range cases {
		actual := splitToTiles(c.TestImage, c.NumTilesSize)

		if len(actual) != len(c.Expected) {
			t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
			// Comparing element by element would index past the end of the
			// shorter slice, so move on to the next case.
			continue
		}

		for i := range actual {
			if actual[i].Bounds() != c.Expected[i].Bounds() {
				t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
			}
		}
	}
}
// TestResize checks the bounds of the resized image and the tile arrangement
// reported by resizeImage. Every case is run with the "png" format.
func TestResize(t *testing.T) {
	// blank builds a zero-origin RGBA image of the given size.
	blank := func(w, h int) image.Image {
		return image.NewRGBA(image.Rect(0, 0, w, h))
	}

	tests := []struct {
		TestImage           image.Image
		OutputSize          image.Point
		MaxImageTiles       int
		ExpectedImage       image.Image
		ExpectedAspectRatio image.Point
	}{
		{blank(200, 200), image.Pt(100, 100), 1, blank(100, 100), image.Pt(1, 1)},
		{blank(200, 200), image.Pt(100, 100), 2, blank(100, 100), image.Pt(1, 1)},
		{blank(10, 10), image.Pt(560, 560), 4, blank(560, 560), image.Pt(1, 1)},
		{blank(2560, 1920), image.Pt(560, 560), 4, blank(1120, 840), image.Pt(2, 2)},
		{blank(1024, 768), image.Pt(560, 560), 4, blank(1024, 768), image.Pt(2, 2)},
	}

	for _, tc := range tests {
		gotImage, gotRatio := resizeImage(tc.TestImage, "png", tc.OutputSize, tc.MaxImageTiles)

		if gotBounds, wantBounds := gotImage.Bounds(), tc.ExpectedImage.Bounds(); gotBounds != wantBounds {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", gotBounds, wantBounds)
		}

		if gotRatio != tc.ExpectedAspectRatio {
			t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", gotRatio, tc.ExpectedAspectRatio)
		}
	}
}
// TestPad checks that padImage returns a canvas whose bounds equal the tile
// size multiplied by the tile arrangement.
func TestPad(t *testing.T) {
	tests := []struct {
		TestImage   image.Image
		OutputSize  image.Point
		AspectRatio image.Point
		Expected    image.Image
	}{
		{
			image.NewRGBA(image.Rect(0, 0, 1000, 667)),
			image.Pt(560, 560),
			image.Pt(2, 2),
			image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
		},
	}

	for _, tc := range tests {
		got := padImage(tc.TestImage, tc.OutputSize, tc.AspectRatio).Bounds()
		want := tc.Expected.Bounds()
		if got != want {
			t.Errorf("image size incorrect: '%#v': expected: '%#v'", got, want)
		}
	}
}
// TestPackImages checks that packImages returns three values per pixel for
// every 560x560 tile of the input canvas.
func TestPackImages(t *testing.T) {
	type packCase struct {
		TestImage    image.Image
		AspectRatio  image.Point
		ExpectedVals int
	}

	cases := []packCase{
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
			AspectRatio:  image.Point{2, 2},
			ExpectedVals: 2 * 2 * 3 * 560 * 560,
		},
		{
			TestImage:    image.NewRGBA(image.Rect(0, 0, 560, 560)),
			AspectRatio:  image.Point{1, 1},
			ExpectedVals: 1 * 1 * 3 * 560 * 560,
		},
		{
			// A 1120 wide by 560 tall canvas is two tiles across and one
			// down, i.e. X=2, Y=1.
			TestImage:    image.NewRGBA(image.Rect(0, 0, 1120, 560)),
			AspectRatio:  image.Point{2, 1},
			ExpectedVals: 2 * 1 * 3 * 560 * 560,
		},
	}

	for _, c := range cases {
		actualVals := packImages(c.TestImage, c.AspectRatio)
		if len(actualVals) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
		}
	}
}
// TestPreprocess runs PNG-encoded images through Preprocess and checks the
// number of packed values and the reported aspect ratio index.
func TestPreprocess(t *testing.T) {
	type preprocessCase struct {
		TestImage             image.Image
		ExpectedVals          int
		ExpectedAspectRatioID int
	}

	cases := []preprocessCase{
		{
			// Fits a single tile: arrangement {1, 1}, first in the list.
			TestImage:             image.NewRGBA(image.Rect(0, 0, 10, 10)),
			ExpectedVals:          1 * 1 * 3 * 560 * 560,
			ExpectedAspectRatioID: 1,
		},
		{
			// Needs a 2x2 canvas: arrangement {2, 2}, sixth in the list.
			TestImage:             image.NewRGBA(image.Rect(0, 0, 1024, 768)),
			ExpectedVals:          2 * 2 * 3 * 560 * 560,
			ExpectedAspectRatioID: 6,
		},
	}

	for _, c := range cases {
		var buf bytes.Buffer
		err := png.Encode(&buf, c.TestImage)
		if err != nil {
			t.Fatal(err)
		}

		imgData, opts, err := Preprocess(&buf)
		if err != nil {
			t.Fatalf("error processing: %q", err)
		}

		if len(imgData) == 0 {
			t.Errorf("no image data returned")
		} else if len(imgData) != c.ExpectedVals {
			t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(imgData), c.ExpectedVals)
		}

		ar, ok := opts["aspectRatioIndex"]
		if !ok {
			t.Fatalf("no aspect ratio found")
		}

		aspectRatioID, ok := ar.(int)
		if !ok {
			t.Fatalf("aspect ratio index has type %T, expected int", ar)
		}

		if aspectRatioID != c.ExpectedAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
		}
	}
}
...@@ -2,9 +2,6 @@ package mllama ...@@ -2,9 +2,6 @@ package mllama
import ( import (
"bytes" "bytes"
"encoding/binary"
"fmt"
"hash/fnv"
"image" "image"
"slices" "slices"
...@@ -34,10 +31,6 @@ const ( ...@@ -34,10 +31,6 @@ const (
) )
func New(c fs.Config) (model.Model, error) { func New(c fs.Config) (model.Model, error) {
// Verify unified config
if c.Uint("vision.block_count") == 0 {
return nil, fmt.Errorf("non-unified vision model not supported")
}
m := Model{ m := Model{
BytePairEncoding: model.NewBytePairEncoding( BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
...@@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) { ...@@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"), Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"), Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"), Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true), AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")), BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false), AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
}, },
), ),
ImageProcessor: newImageProcessor(c), ImageProcessor: newImageProcessor(c),
...@@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) { ...@@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil return &m, nil
} }
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) { func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 { if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
return nil, model.ErrNoVisionModel return nil, model.ErrNoVisionModel
} }
...@@ -73,81 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er ...@@ -73,81 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err return nil, err
} }
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image) f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
if err != nil { if err != nil {
return nil, err return nil, err
} }
pixelValues, err := ctx.Input().FromFloatSlice(f32s, if ratio.numTiles() < m.maxNumTiles {
m.ImageProcessor.imageSize, // Pad tiles to maxNumTiles
m.ImageProcessor.imageSize, f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
m.ImageProcessor.numChannels, f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
m.ImageProcessor.maxNumTiles,
)
if err != nil {
return nil, err
} }
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1) pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
if err != nil { aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
return nil, err
}
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32) positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio) crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
return m.Projector.Forward(ctx, crossAttentionStates), nil projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
} }
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) { func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var images []input.Input
fnvHash := fnv.New64a()
for i := range inputs { for i := range inputs {
if inputs[i].Multimodal == nil { if inputs[i].Multimodal != nil {
if len(images) > 0 { inputs[i].Token = 128256 // <|image|>
inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
inputs[i].MultimodalHash = images[0].MultimodalHash
for j := 1; j < len(images); j++ {
inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[0].Multimodal.(ml.Tensor))
fnvHash.Reset()
binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
inputs[i].MultimodalHash = fnvHash.Sum64()
}
images = nil
}
} else {
images = append(images, inputs[i])
inputs[i].Token = -1
} }
} }
inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
return inputs, nil return inputs, nil
} }
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) { func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 { if len(batch.Multimodal) > 0 {
images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor) crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
if len(images) > 0 {
crossAttentionStates = images[len(images)-1]
}
}
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
} }
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs)) positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil { outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return nil, err
}
// TODO: attention mask, cross attention mask // TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
} }
func init() { func init() {
......
...@@ -8,6 +8,8 @@ import ( ...@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache" "github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn" "github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
) )
type TextSelfAttention struct { type TextSelfAttention struct {
...@@ -18,18 +20,17 @@ type TextSelfAttention struct { ...@@ -18,18 +20,17 @@ type TextSelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"` RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
} }
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1) batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
query := sa.Query.Forward(ctx, hiddenState) query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize) query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key := sa.Key.Forward(ctx, hiddenState) key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize) key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale) key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
value := sa.Value.Forward(ctx, hiddenState) value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize) value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
...@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m ...@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) { func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// This will only get called for layers in the cache, which are just the self attention layers // This will only get called for layers in the cache, which are just the self attention layers
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok { if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
} }
return key, nil return key, nil
...@@ -69,11 +70,11 @@ type TextSelfAttentionDecoderLayer struct { ...@@ -69,11 +70,11 @@ type TextSelfAttentionDecoderLayer struct {
MLP *TextMLP MLP *TextMLP
} }
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts) hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions // In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for. // we need logits for.
...@@ -151,7 +152,7 @@ type TextCrossAttentionDecoderLayer struct { ...@@ -151,7 +152,7 @@ type TextCrossAttentionDecoderLayer struct {
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"` MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
} }
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
...@@ -167,14 +168,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, ...@@ -167,14 +168,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
} }
type TextDecoderLayer interface { type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
} }
type TextDecoder struct { type TextDecoder struct {
Layers []TextDecoderLayer Layers []TextDecoderLayer
} }
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor { func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers { for i, layer := range d.Layers {
layerType := selfAttentionLayer layerType := selfAttentionLayer
if slices.Contains(opts.crossAttentionLayers, int32(i)) { if slices.Contains(opts.crossAttentionLayers, int32(i)) {
...@@ -190,7 +191,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, ...@@ -190,7 +191,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
lastLayerOutputs = outputs lastLayerOutputs = outputs
} }
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts) hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
} }
} }
...@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, ...@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
type TextModelOptions struct { type TextModelOptions struct {
hiddenSize, numHeads, numKVHeads int hiddenSize, numHeads, numKVHeads int
ropeDim int
eps, ropeBase, ropeScale float32 eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []int32 crossAttentionLayers []int32
} }
...@@ -214,9 +215,9 @@ type TextModel struct { ...@@ -214,9 +215,9 @@ type TextModel struct {
*TextModelOptions *TextModelOptions
} }
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor { func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs) hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions) hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState) return m.Output.Forward(ctx, hiddenState)
} }
...@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel { ...@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
hiddenSize: int(c.Uint("embedding_length")), hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")), numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")), numKVHeads: int(c.Uint("attention.head_count_kv")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"), eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"), ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1), ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Ints("attention.cross_attention_layers"), crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
}, },
} }
......
...@@ -15,9 +15,7 @@ type VisionSelfAttention struct { ...@@ -15,9 +15,7 @@ type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"` Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"` Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"` Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"` Output *nn.Linear `gguf:"attn_output"`
Gate ml.Tensor `gguf:"attn_gate"`
} }
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
...@@ -25,56 +23,38 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op ...@@ -25,56 +23,38 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
query := sa.Query.Forward(ctx, hiddenState) query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize) query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key := sa.Key.Forward(ctx, hiddenState) key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize) key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value := sa.Value.Forward(ctx, hiddenState) value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize) value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores) attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize) attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
return sa.Output.Forward(ctx, attention)
hiddenState = sa.Output.Forward(ctx, attention)
if sa.Gate != nil {
hiddenState = hiddenState.Mul(ctx, sa.Gate)
}
return hiddenState
} }
type VisionMLP struct { type VisionMLP struct {
Down *nn.Linear `gguf:"ffn_down"`
Up *nn.Linear `gguf:"ffn_up"` Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate ml.Tensor `gguf:"ffn_gate"`
} }
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx) hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
hiddenState = mlp.Up.Forward(ctx, hiddenState) hiddenState = mlp.Down.Forward(ctx, hiddenState)
if mlp.Gate != nil {
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
}
return hiddenState return hiddenState
} }
type VisionEncoderLayer struct { type VisionEncoderLayer struct {
AttentionNorm *nn.LayerNorm `gguf:"ln1"` AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
SelfAttention *VisionSelfAttention SelfAttention *VisionSelfAttention
AttentionGate ml.Tensor `gguf:"attn_gate"`
MLPNorm *nn.LayerNorm `gguf:"ln2"` MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
MLP *VisionMLP MLP *VisionMLP
MLPGate ml.Tensor `gguf:"ffn_gate"`
} }
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
...@@ -83,13 +63,19 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts ...@@ -83,13 +63,19 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
// self attention // self attention
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts) hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
if e.AttentionGate != nil {
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
}
hiddenState = hiddenState.Add(ctx, residual) hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState residual = hiddenState
// feed forward
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps) hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.MLP.Forward(ctx, hiddenState, opts) hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual) if e.MLPGate != nil {
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
}
hiddenState = hiddenState.Add(ctx, residual)
return hiddenState
} }
type VisionEncoder struct { type VisionEncoder struct {
...@@ -114,9 +100,9 @@ type PrecomputedAspectRatioEmbedding struct { ...@@ -114,9 +100,9 @@ type PrecomputedAspectRatioEmbedding struct {
Gate ml.Tensor `gguf:"gate"` Gate ml.Tensor `gguf:"gate"`
} }
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor { func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs) embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles) embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
if e.Gate != nil { if e.Gate != nil {
embeddings = embeddings.Mul(ctx, e.Gate) embeddings = embeddings.Mul(ctx, e.Gate)
} }
...@@ -132,7 +118,7 @@ type PrecomputedPositionEmbedding struct { ...@@ -132,7 +118,7 @@ type PrecomputedPositionEmbedding struct {
TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"` TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
} }
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor { func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs) positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
if e.PositionEmbeddingGate != nil { if e.PositionEmbeddingGate != nil {
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate) positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
...@@ -141,7 +127,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi ...@@ -141,7 +127,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
hiddenState = hiddenState.Add(ctx, positionEmbedding) hiddenState = hiddenState.Add(ctx, positionEmbedding)
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs) tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles) tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
if e.TilePositionEmbeddingGate != nil { if e.TilePositionEmbeddingGate != nil {
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate) tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
} }
...@@ -150,9 +136,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi ...@@ -150,9 +136,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
} }
type VisionModelOptions struct { type VisionModelOptions struct {
hiddenSize, numHeads, numTiles int hiddenSize, numHeads int
imageSize, patchSize int imageSize, patchSize int
eps float32 eps float32
intermediateLayersIndices []int32 intermediateLayersIndices []int32
} }
...@@ -181,14 +167,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa ...@@ -181,14 +167,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
numPositions++ numPositions++
} }
numTiles := pixelValues.Dim(3)
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1) hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles) hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx) hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1) hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions) hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8 numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
...@@ -199,18 +187,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa ...@@ -199,18 +187,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps) hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions) hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions) hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...) hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0) hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize) hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0) hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
return hiddenState.Concat(ctx, hiddenStates, 0) return hiddenState.Concat(ctx, hiddenStates, 0)
} }
...@@ -222,7 +210,6 @@ func newVisionModel(c fs.Config) *VisionModel { ...@@ -222,7 +210,6 @@ func newVisionModel(c fs.Config) *VisionModel {
VisionModelOptions: &VisionModelOptions{ VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length")), hiddenSize: int(c.Uint("vision.embedding_length")),
numHeads: int(c.Uint("vision.attention.head_count")), numHeads: int(c.Uint("vision.attention.head_count")),
numTiles: int(c.Uint("vision.max_num_tiles")),
imageSize: int(c.Uint("vision.image_size")), imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")), patchSize: int(c.Uint("vision.patch_size")),
......
...@@ -2,17 +2,31 @@ package mllama ...@@ -2,17 +2,31 @@ package mllama
import ( import (
"image" "image"
"image/color"
"math" "math"
"slices" "slices"
"golang.org/x/image/draw" "golang.org/x/image/draw"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
) )
// supportedAspectRatio is one tile-grid arrangement: width tiles across by
// height tiles down. rank is the identifier stored alongside the arrangement
// by whoever builds the list of supported ratios.
type supportedAspectRatio struct {
	rank   int
	width  int
	height int
}

// Point converts the arrangement to an image.Point whose X is the tile
// count across and whose Y is the tile count down.
func (a supportedAspectRatio) Point() image.Point {
	return image.Point{X: a.width, Y: a.height}
}

// numTiles reports the total number of tiles in the grid (width * height).
func (a supportedAspectRatio) numTiles() int {
	return a.height * a.width
}
type ImageProcessor struct { type ImageProcessor struct {
imageSize, numChannels, maxNumTiles int imageSize, numChannels, maxNumTiles int
mean, std [3]float32
} }
func newImageProcessor(c fs.Config) ImageProcessor { func newImageProcessor(c fs.Config) ImageProcessor {
...@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor { ...@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
imageSize: int(c.Uint("vision.image_size")), imageSize: int(c.Uint("vision.image_size")),
numChannels: int(c.Uint("vision.num_channels")), numChannels: int(c.Uint("vision.num_channels")),
maxNumTiles: int(c.Uint("vision.max_num_tiles")), maxNumTiles: int(c.Uint("vision.max_num_tiles")),
mean: imageproc.ClipDefaultMean,
std: imageproc.ClipDefaultSTD,
} }
} }
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point { func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
ratios := []image.Point{} for w := 1; w <= p.maxNumTiles; w++ {
for h := 1; h <= p.maxNumTiles/w; h++ {
for w := range maxTiles { ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
} }
} }
return ratios return ratios
} }
func (p *ImageProcessor) clip(a, a_min, a_max int) int { func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
if a < a_min { tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
return a_min th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
} else if a > a_max {
return a_max
}
return a
}
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X) r := math.Min(
scaleHeight := float64(targetHeight) / float64(imageSize.Y) float64(tw)/float64(imageSize.X),
float64(th)/float64(imageSize.Y),
)
var w, h int w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h} return image.Point{w, h}
} }
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point { func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles) possibleTileArrangements := p.supportedAspectRatios()
possibleCanvasSizes := []image.Point{} possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
for _, pta := range possibleTileArrangements { for i, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize}) possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
} }
scales := []float64{} scales := make([]float64, len(possibleCanvasSizes))
for i, pcs := range possibleCanvasSizes {
for _, pcs := range possibleCanvasSizes { scales[i] = min(
scaleHeight := float64(pcs.Y) / float64(imageSize.Y) float64(pcs.Y)/float64(imageSize.Y),
scaleWidth := float64(pcs.X) / float64(imageSize.X) float64(pcs.X)/float64(imageSize.X),
)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
} }
var minUpscale float64 var minUpscale float64
...@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles ...@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
return selectedCanvas return selectedCanvas
} }
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image { func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds() b := img.Bounds()
width := b.Max.X - b.Min.X width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X tileWidth := width / numTilesSize.X
images := []image.Image{} images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
for h := range numTilesSize.Y { for h := range numTilesSize.Y {
for w := range numTilesSize.X { for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1)) rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface { if subImg, ok := img.(interface {
SubImage(image.Rectangle) image.Image SubImage(image.Rectangle) image.Image
}).SubImage(rect)) }); ok {
images = append(images, subImg.SubImage(rect))
} else {
// Handle the case where img does not implement SubImage
// This is a fallback and may not be efficient
newImg := image.NewRGBA(rect)
draw.Draw(newImg, rect, img, rect.Min, draw.Src)
images = append(images, newImg)
}
} }
} }
return images return images
} }
// remove the "alpha" channel by drawing over a prefilled image func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
	bounds := img.Bounds()
	canvas := image.NewRGBA(bounds)

	// Pre-fill the whole canvas with opaque white so that transparent
	// source pixels end up white rather than black.
	background := &image.Uniform{C: color.RGBA{R: 255, G: 255, B: 255, A: 255}}
	draw.Draw(canvas, bounds, background, image.Point{}, draw.Src)

	// Blend the source over the white background; the result is fully opaque.
	draw.Draw(canvas, bounds, img, bounds.Min, draw.Over)

	return canvas
}
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
b := img.Bounds() b := img.Bounds()
tileSize := outputSize.Y
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize) canvasSize := p.optimalTiledCanvas(b.Max)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize} aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize) newSize := p.fitToCanvas(b.Max, canvasSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y)) dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
...@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag ...@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
return dst, aspectRatio return dst, aspectRatio
} }
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image { func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
paddedSize := image.Point{ paddedSize := image.Point{
X: outputSize.X * aspectRatio.X, X: p.imageSize * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y, Y: p.imageSize * aspectRatio.Y,
} }
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y)) dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
...@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin ...@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
return dst return dst
} }
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 { func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
subImages := p.splitToTiles(img, aspectRatio) subImages := p.splitToTiles(img, aspectRatio)
var pixelVals []float32 var pixelVals []float32
...@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st ...@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
gVal := float32(g>>8) / 255.0 gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0 bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0] rVal = (rVal - p.mean[0]) / p.std[0]
gVal = (gVal - mean[1]) / std[1] gVal = (gVal - p.mean[1]) / p.std[1]
bVal = (bVal - mean[2]) / std[2] bVal = (bVal - p.mean[2]) / p.std[2]
rVals = append(rVals, rVal) rVals = append(rVals, rVal)
gVals = append(gVals, gVal) gVals = append(gVals, gVal)
...@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st ...@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
return pixelVals return pixelVals
} }
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) { func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
outputSize := image.Point{p.imageSize, p.imageSize} newImage, newImageRatio := p.resize(img)
newImage = p.pad(newImage, newImageRatio)
// clip values pixelValues := p.pack(newImage, newImageRatio)
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles) supportedAspectRatios := p.supportedAspectRatios()
newImage = p.pad(newImage, outputSize, aspectRatio) aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
return i.width == newImageRatio.X && i.height == newImageRatio.Y
})
data := p.pack(newImage, aspectRatio, mean, std) return pixelValues, supportedAspectRatios[aspectRatioID], nil
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment