Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
......@@ -2,117 +2,17 @@ package model
import (
"cmp"
"context"
"fmt"
"iter"
"log/slog"
"slices"
"strings"
"sync"
"github.com/dlclark/regexp2"
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
"github.com/ollama/ollama/logutil"
)
type Special int32
const (
SpecialBOS Special = iota
SpecialEOS
)
const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocabulary() *Vocabulary
}
type Vocabulary struct {
Values []string
Types []int32
Scores []float32
Merges []string
BOS, EOS, EOT int32
AddBOS, AddEOS, AddEOT bool
specialOnce sync.Once
special []string
valuesOnce sync.Once
values map[string]int32
mergeOnce sync.Once
merge map[string]int32
}
func (v *Vocabulary) Is(id int32, special Special) bool {
switch special {
case SpecialBOS:
return id == v.BOS
case SpecialEOS:
return id == v.EOS || id == v.EOT
default:
return false
}
}
func (v *Vocabulary) Encode(s string) int32 {
v.valuesOnce.Do(func() {
v.values = make(map[string]int32, len(v.Values))
for i, value := range v.Values {
v.values[value] = int32(i)
}
})
if id, ok := v.values[s]; ok {
return id
}
return -1
}
func (v *Vocabulary) Decode(id int32) string {
return v.Values[id]
}
func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if slices.Contains([]int{105, 106}, i) {
v.special = append(v.special, v.Values[i])
} else if v.Types[i] == TOKEN_TYPE_CONTROL {
v.special = append(v.special, v.Values[i])
}
}
})
return v.special
}
func (v *Vocabulary) Merge(left, right string) int {
v.mergeOnce.Do(func() {
v.merge = make(map[string]int32, len(v.Merges))
for i, merge := range v.Merges {
v.merge[merge] = int32(i)
}
})
if id, ok := v.merge[left+" "+right]; ok {
return int(id)
}
return -1
}
type BytePairEncoding struct {
pre *regexp2.Regexp
vocab *Vocabulary
......@@ -302,29 +202,23 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
if addSpecial && len(ids) > 0 {
if bpe.vocab.AddBOS {
if ids[0] == bpe.vocab.BOS {
slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
}
slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
ids = append([]int32{bpe.vocab.BOS}, ids...)
}
slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
if bpe.vocab.AddEOS {
if ids[len(ids)-1] == bpe.vocab.EOS {
slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
}
slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
ids = append(ids, bpe.vocab.EOS)
}
if addSpecial && len(ids) > 0 {
ids = bpe.vocab.addSpecials(ids)
}
return ids, nil
}
type lazyIdsString struct {
ids []int32
}
func (l lazyIdsString) LogValue() slog.Value {
return slog.AnyValue(fmt.Sprint(l.ids))
}
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
......@@ -349,5 +243,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
}
}
slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String(), "from", lazyIdsString{ids: ids})
return sb.String(), nil
}
......@@ -2,16 +2,30 @@ package input
import "github.com/ollama/ollama/ml"
// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type Multimodal struct {
// Tensor is the embedding data. Implementations may choose what to
// store here or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor ml.Tensor
// Data is implementation-specific opaque data, such as metadata on how
// to lay out Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data any
}
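// Illustrative sketch (not part of this change; the function and the row
// split are hypothetical): an encoder might break one projected feature
// tensor into per-row views, returning each view as its own Multimodal entry
// so the runner can schedule the rows in separate batches. Data is left free
// for any model-specific metadata.
//
//	func splitRows(ctx ml.Context, features ml.Tensor, rows int) []Multimodal {
//		out := make([]Multimodal, 0, rows)
//		for i := range rows {
//			// one column of the feature matrix, viewed without copying
//			view := features.View(ctx, features.Stride(1)*i, features.Dim(0))
//			out = append(out, Multimodal{Tensor: view})
//		}
//		return out
//	}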
// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
Token int32
// Multimodal is opaque data representing a non-text
// element such as an image (or part of one if the image
// can be processed in pieces). It may be either together
// with Token or on its own.
Multimodal any
// Multimodal represents a non-text element such as an
// image (or part of one if the image can be processed in pieces).
// It may be used either together with Token or on its own.
Multimodal []Multimodal
// MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing
......@@ -32,7 +46,7 @@ type Input struct {
// Positions slice.
type MultimodalIndex struct {
Index int
Multimodal any
Multimodal []Multimodal
}
// Batch contains the inputs for a model forward pass
......
......@@ -19,6 +19,7 @@ import (
"github.com/ollama/ollama/fs"
fsggml "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
_ "github.com/ollama/ollama/ml/backend"
"github.com/ollama/ollama/model/input"
......@@ -39,12 +40,13 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model.
//
// The return value is most typically an ml.Tensor, however, different
// types are possible, such as an object containing a tensor plus
// additional metadata, a slice of tensors or even just the original input.
// The return value is one or more tensors, each with optional model-specific
// opaque metadata. Typically, the tensors might be views into an embedding
// with each view representing a chunk of data that can be processed independently
// in different batches.
//
// The result may be cached by the runner.
EncodeMultimodal(ml.Context, []byte) (any, error)
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
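// A hedged sketch of the new contract (the model type, its fields, and the
// preprocessing helper below are hypothetical, not part of this change):
//
//	func (m *exampleModel) EncodeMultimodal(ctx ml.Context, data []byte) ([]input.Multimodal, error) {
//		f32s, size, err := m.preprocessImage(data) // hypothetical helper
//		if err != nil {
//			return nil, err
//		}
//		pixels := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, 3)
//		features := m.visionTower.Forward(ctx, pixels)
//		// one entry per independently schedulable chunk; here, the whole image
//		return []input.Multimodal{{Tensor: features}}, nil
//	}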
// PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements.
......@@ -96,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
}
// New initializes a new model instance with the provided configuration based on the metadata in the model file
func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
r, err := os.Open(modelPath)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(ctx, r, params)
func New(modelPath string, params ml.BackendParams) (Model, error) {
b, err := ml.NewBackend(modelPath, params)
if err != nil {
return nil, err
}
......@@ -132,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
return nil, err
}
defer r.Close()
meta, _, err := fsggml.Decode(r, -1)
meta, err := fsggml.Decode(r, -1)
if err != nil {
return nil, err
}
......@@ -202,7 +198,7 @@ func populateFields(base Base, v reflect.Value, tags ...Tag) reflect.Value {
names := fn(tagsCopy)
for _, name := range names {
if tensor := base.Backend().Get(strings.Join(name, ".")); tensor != nil {
slog.Debug("found tensor", "", tensor)
slog.Log(context.TODO(), logutil.LevelTrace, "found tensor", "", tensor)
vv.Set(reflect.ValueOf(tensor))
break
}
......@@ -291,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
return nil, errors.New("batch size cannot be less than 1")
}
var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return nil, err
}
batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))
cache := m.Config().Cache
if cache != nil {
......
......@@ -7,6 +7,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
......@@ -43,8 +45,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Layers: make([]Layer, c.Uint("block_count")),
......@@ -78,11 +85,10 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
......@@ -92,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
......@@ -122,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
}
type MLP struct {
......@@ -169,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
......
......@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(1),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOT: int32(106),
AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
EOS: append(
[]int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
ImageProcessor: newImageProcessor(c),
......@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
......@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
pixelValues := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
return visionOutputs, nil
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if inp.Multimodal == nil {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
} else {
inputMultimodal := inp.Multimodal.(ml.Tensor)
inputMultimodal := inp.Multimodal[0].Tensor
result = append(result,
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>"
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
)
// add image token placeholders
......@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
......
......@@ -7,7 +7,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
......@@ -20,9 +21,6 @@ type TextConfig struct {
}
type TextModel struct {
model.Base
model.SentencePieceModel
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []TextLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
......@@ -45,15 +43,6 @@ func newTextModel(c fs.Config) *TextModel {
numBlocks := int(c.Uint("block_count"))
m := TextModel{
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
},
),
Layers: make([]TextLayer, numBlocks),
TextConfig: &TextConfig{
hiddenSize: int(c.Uint("embedding_length")),
......@@ -86,7 +75,6 @@ type TextSelfAttention struct {
func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(2)
ropeBase := opts.ropeLocalBase
if (layer+1)%gemmaGlobalCacheCount == 0 {
......@@ -96,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
q = sa.QueryNorm.Forward(ctx, q, opts.eps)
q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
if opts.largeModelScaling {
q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
......@@ -107,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
k = sa.KeyNorm.Forward(ctx, k, opts.eps)
k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
......@@ -125,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
ropeBase = m.TextConfig.ropeGlobalBase
}
return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
}
type TextMLP struct {
......@@ -178,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// set image embeddings
var except []int
for _, image := range batch.Multimodal {
visionOutputs := image.Multimodal.(ml.Tensor)
visionOutputs := image.Multimodal[0].Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
for i := range visionOutputs.Dim(1) {
......
package llama
import (
"fmt"
"cmp"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type Model struct {
......@@ -32,10 +33,6 @@ type Model struct {
}
func New(c fs.Config) (model.Model, error) {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
......@@ -43,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Layers: make([]Layer, c.Uint("block_count")),
......@@ -54,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}
......@@ -74,31 +75,31 @@ type SelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeDim := cmp.Or(opts.ropeDim, headDim)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
return sa.Output.Forward(ctx, kqv)
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
return sa.Output.Forward(ctx, attention)
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
}
type MLP struct {
......@@ -119,11 +120,11 @@ type Layer struct {
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
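// The pruning itself happens in the elided hunk below; a minimal sketch of
// the idea (assuming an ml.Tensor row-gather operation, not necessarily the
// exact code in this change) is:
//
//	if outputs != nil {
//		hiddenState = hiddenState.Rows(ctx, outputs)
//		residual = residual.Rows(ctx, outputs)
//	}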
......@@ -141,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var lastLayerOutputs ml.Tensor
var outputs ml.Tensor
if i == len(m.Layers)-1 {
lastLayerOutputs = outputs
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
}
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
......
......@@ -4,7 +4,6 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
......@@ -41,10 +40,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
ImageProcessor: newImageProcessor(c),
......@@ -60,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) < 1 {
return nil, model.ErrNoVisionModel
}
......@@ -75,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}
tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
if err != nil {
return nil, err
}
tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize
......@@ -89,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
pixelValues := tilesLocal
if len(pixelsGlobal) > 0 {
tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
if err != nil {
return nil, err
}
tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
}
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
}
type chunks struct {
*Model
ml.Tensor
aspectRatio image.Point
var multimodal []input.Multimodal
aspectRatio := image.Point{ratioW, ratioH}
var offset int
patchesPerChunk := projectedOutputs.Dim(1)
if aspectRatio.Y*aspectRatio.X > 1 {
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
for range aspectRatio.Y {
for x := range aspectRatio.X {
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
var separator separator
if x < aspectRatio.X-1 {
separator.x = true // <|tile_x_separator|>
} else {
separator.y = true // <|tile_y_separator|>
}
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
offset += patchesPerChunk
}
}
}
dataOnce sync.Once
data []float32
}
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
type chunk struct {
*chunks
s, n int
return multimodal, nil
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
type separator struct {
x bool
y bool
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if inp.Multimodal == nil {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
continue
}
t := inp.Multimodal.(*chunks)
var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
var offset int
patchesPerChunk := t.Dim(1)
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
for range t.aspectRatio.Y {
for x := range t.aspectRatio.X {
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if x < t.aspectRatio.X-1 {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
offset += patchesPerChunk
}
for i, mm := range inp.Multimodal {
patchesPerChunk := mm.Tensor.Dim(1)
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
if i < len(inp.Multimodal)-1 {
separator := mm.Data.(*separator)
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if separator.x {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
if separator.y {
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
} else {
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
}
}
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
result = append(result, imageInputs...)
}
......@@ -171,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
......
......@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
......@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
if useRope {
query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
}
if opts.useQKNorm {
......@@ -61,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
}
type TextExperts struct {
Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
Up ml.Tensor `gguf:"ffn_up_exps.weight"`
Down ml.Tensor `gguf:"ffn_down_exps.weight"`
Gate *nn.Linear `gguf:"ffn_gate_exps"`
Up *nn.Linear `gguf:"ffn_up_exps"`
Down *nn.Linear `gguf:"ffn_down_exps"`
}
func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
......@@ -74,13 +76,13 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
hiddenStates = hiddenStates.Mul(ctx, scores)
upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
}
return nextStates
......@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
img := mi.Multimodal[0].Tensor
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}
......@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
}
var err error
attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
if err != nil {
panic(err)
}
attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
}
for i, layer := range m.Layers {
......@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
}
......@@ -208,7 +208,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}
hiddenStates = m.LayerNormPost.Forward(ctx, hiddenStates, m.eps)
hiddenStates = hiddenStates.Unpad(ctx, 0, 1, 0, 0)
hiddenStates = hiddenStates.Pad(ctx, 0, -1, 0, 0)
hiddenStates = m.VisionAdapter.Forward(ctx, hiddenStates, m.VisionOptions)
return hiddenStates
}
......@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
}
}
ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
if err != nil {
panic(err)
}
ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
......
......@@ -4,7 +4,6 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
......@@ -16,6 +15,8 @@ import (
type Model struct {
model.Base
model.BytePairEncoding
*TextModel
*VisionModel `gguf:"v,vision"`
*MultiModalProjector `gguf:"mm"`
......@@ -30,13 +31,23 @@ var _ model.MultimodalProcessor = (*Model)(nil)
var _ model.TextProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
textModel, err := NewTextModel(c)
if err != nil {
return nil, err
}
m := &Model{
TextModel: textModel,
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
MultiModalProjector: newMultiModalProjector(c),
......@@ -88,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
}
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
......@@ -103,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
if err != nil {
return nil, err
}
pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
// split into patches to be sent to the text transformer
parent := imageFeatures{tensor: features}
rows := make([]*imageRow, size.Y)
rows := make([]input.Multimodal, size.Y)
for i := range rows {
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
}
return rows, nil
}
type imageFeatures struct {
tensor ml.Tensor
dataOnce sync.Once
data []float32
}
type imageRow struct {
parent *imageFeatures
s int
shape []int
}
func (r *imageRow) data() []float32 {
n := 1
for _, s := range r.shape {
n *= s
}
return r.parent.data[r.s*n : (r.s+1)*n]
}
// PostTokenize arranges Mistral 3's inputs for the forward pass
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
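// For illustration (assuming a hypothetical image split into 2 rows of 3
// patches each), the produced stream would be:
//
//	[IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG_END]
//
// where the first [IMG] (token 10) of each row also carries that row's tensor
// in Multimodal, and the final token is [IMG_END] (token 13).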
......@@ -151,15 +136,14 @@ func (r *imageRow) data() []float32 {
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if inp.Multimodal == nil {
if len(inp.Multimodal) == 0 {
result = append(result, inp)
} else {
inputMultimodal := inp.Multimodal.([]*imageRow)
for i, row := range inputMultimodal {
for i, row := range inp.Multimodal {
// [IMG]
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
if i == len(inputMultimodal)-1 {
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
if i == len(inp.Multimodal)-1 {
// [IMG_END]
result = append(result, input.Input{Token: 13})
} else {
......@@ -174,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
}
......
package mistral3
import (
"fmt"
"cmp"
"math"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/model/input"
)
type TextOptions struct {
hiddenSize, numHeads, numKVHeads, headDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
}
type TextModel struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
......@@ -40,19 +36,15 @@ type SelfAttention struct {
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
ropeType := uint32(0)
headDim := opts.headDim
if headDim == 0 {
headDim = opts.hiddenSize / opts.numHeads
}
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
......@@ -63,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
}
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
}
type MLP struct {
......@@ -110,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// image embeddings
for _, image := range batch.Multimodal {
row := image.Multimodal.(*imageRow)
row.parent.dataOnce.Do(func() {
// use a new, throwaway context so the image tensor is not added to the graph
temp := m.Backend().NewContext()
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
row.parent.data = row.parent.tensor.Floats()
temp.Close()
})
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
if err != nil {
panic(err)
}
imageFeature := image.Multimodal[0].Tensor
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
}
......@@ -142,36 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
return m.Output.Forward(ctx, hiddenState)
}
func NewTextModel(c fs.Config) (*TextModel, error) {
if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
}
textModel := &TextModel{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
},
),
func newTextModel(c fs.Config) *TextModel {
return &TextModel{
Layers: make([]Layer, c.Uint("block_count")),
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}
return textModel, nil
}
......@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
}
}
h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
if err != nil {
panic(err)
}
w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
if err != nil {
panic(err)
}
h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
......@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
}
}
positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
if err != nil {
panic(err)
}
positionIDs := ctx.Input().FromIntSlice(positions, len(positions))
positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
......@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
func newVisionModel(c fs.Config) *VisionModel {
return &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1024)),
numHeads: int(c.Uint("vision.attention.head_count", 16)),
......
package mllama
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/model/imageproc"
)
func getSupportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
}
}
return ratios
}
func clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
}
var minUpscale float64
var maxDownscale float64
var upscale bool
for _, s := range scales {
if s > 1.0 {
upscale = true
if minUpscale == 0 {
minUpscale = s
} else {
minUpscale = math.Min(minUpscale, s)
}
} else {
maxDownscale = math.Max(maxDownscale, s)
}
}
selectedScale := maxDownscale
if upscale {
selectedScale = minUpscale
}
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
}
return selectedCanvas
}
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h}
}
func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
if format == "png" {
img = imageproc.Composite(img)
}
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
}
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
return dst
}
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}
}
return images
}
func packImages(img image.Image, aspectRatio image.Point) []float32 {
subImages := splitToTiles(img, aspectRatio)
var pixelVals []float32
rescale := true
channelFirst := true
for _, subImg := range subImages {
vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
pixelVals = append(pixelVals, vals...)
}
return pixelVals
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
outputSize := image.Point{560, 560}
maxTiles := 4
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
newImage = padImage(newImage, outputSize, aspectRatio)
data := packImages(newImage, aspectRatio)
aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
opts := map[string]any{
"aspectRatioIndex": aspectRatioIndex,
}
return data, opts, nil
}
package mllama
import (
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestAspectRatios(t *testing.T) {
type aspectCase struct {
MaxTiles int
Expected []image.Point
}
cases := []aspectCase{
{
MaxTiles: 1,
Expected: []image.Point{{1, 1}},
},
{
MaxTiles: 2,
Expected: []image.Point{{1, 1}, {1, 2}, {2, 1}},
},
{
MaxTiles: 3,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {2, 1}, {3, 1}},
},
{
MaxTiles: 4,
Expected: []image.Point{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {2, 1}, {2, 2}, {3, 1}, {4, 1}},
},
}
for _, c := range cases {
actual := getSupportedAspectRatios(c.MaxTiles)
if diff := cmp.Diff(actual, c.Expected); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestGetImageSizeFitToCanvas(t *testing.T) {
type imageSizeCase struct {
ImageRect image.Point
CanvasRect image.Point
TileSize int
Expected image.Point
}
cases := []imageSizeCase{
{
ImageRect: image.Point{400, 400},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{400, 400},
},
{
ImageRect: image.Point{1024, 768},
CanvasRect: image.Point{640, 480},
TileSize: 200,
Expected: image.Point{640, 480},
},
{
ImageRect: image.Point{500, 500},
CanvasRect: image.Point{1000, 1000},
TileSize: 750,
Expected: image.Point{750, 750},
},
{
ImageRect: image.Point{500, 1000},
CanvasRect: image.Point{2000, 2000},
TileSize: 2000,
Expected: image.Point{1000, 2000},
},
{
ImageRect: image.Point{4000, 3000},
CanvasRect: image.Point{2000, 1000},
TileSize: 1000,
Expected: image.Point{1333, 1000},
},
{
ImageRect: image.Point{667, 1000},
CanvasRect: image.Point{1000, 1000},
TileSize: 560,
Expected: image.Point{667, 1000},
},
}
for _, c := range cases {
actual := getImageSizeFitToCanvas(c.ImageRect, c.CanvasRect, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect image rect: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestGetOptimalTiledCanvas(t *testing.T) {
type tiledCanvasSizeCase struct {
ImageSize image.Point
MaxImageTiles int
TileSize int
Expected image.Point
}
cases := []tiledCanvasSizeCase{
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 1000,
Expected: image.Point{2000, 1000},
},
{
ImageSize: image.Point{1024, 768},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{800, 600},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
{
ImageSize: image.Point{640, 480},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 560},
},
{
ImageSize: image.Point{320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{1320, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1680, 560},
},
{
ImageSize: image.Point{2000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{10000, 200},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{2240, 560},
},
{
ImageSize: image.Point{480, 640},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1120},
},
{
ImageSize: image.Point{200, 320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 560},
},
{
ImageSize: image.Point{200, 1320},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 1680},
},
{
ImageSize: image.Point{200, 2000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{200, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{560, 2240},
},
{
ImageSize: image.Point{10000, 10000},
MaxImageTiles: 4,
TileSize: 560,
Expected: image.Point{1120, 1120},
},
}
for _, c := range cases {
actual := getOptimalTiledCanvas(c.ImageSize, c.MaxImageTiles, c.TileSize)
if actual != c.Expected {
t.Errorf("incorrect tiled canvas: '%#v'. expected: '%#v'", actual, c.Expected)
}
}
}
func TestSplitToTiles(t *testing.T) {
type splitCase struct {
TestImage image.Image
NumTilesSize image.Point
Expected []image.Image
}
cases := []splitCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
NumTilesSize: image.Point{1, 1},
Expected: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 500)),
NumTilesSize: image.Point{2, 1},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 1000)),
NumTilesSize: image.Point{2, 2},
Expected: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
for _, c := range cases {
actual := splitToTiles(c.TestImage, c.NumTilesSize)
if len(actual) != len(c.Expected) {
t.Errorf("incorrect number of images '%d': expected: '%d'", len(actual), len(c.Expected))
}
for i := range actual {
if actual[i].Bounds() != c.Expected[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual[i].Bounds(), c.Expected[i].Bounds())
}
}
}
}
func TestResize(t *testing.T) {
type resizeCase struct {
TestImage image.Image
OutputSize image.Point
MaxImageTiles int
ExpectedImage image.Image
ExpectedAspectRatio image.Point
}
cases := []resizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 1,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 200, 200)),
OutputSize: image.Point{100, 100},
MaxImageTiles: 2,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
ExpectedAspectRatio: image.Point{1, 1},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2560, 1920)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
ExpectedAspectRatio: image.Point{2, 2},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
OutputSize: image.Point{560, 560},
MaxImageTiles: 4,
ExpectedImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedAspectRatio: image.Point{2, 2},
},
}
for _, c := range cases {
actualImage, actualAspectRatio := resizeImage(c.TestImage, "png", c.OutputSize, c.MaxImageTiles)
if actualImage.Bounds() != c.ExpectedImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actualImage.Bounds(), c.ExpectedImage.Bounds())
}
if actualAspectRatio != c.ExpectedAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expected: '%#v'", actualAspectRatio, c.ExpectedAspectRatio)
}
}
}
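// The 2560x1920 case shows how the canvas choice and fitToCanvas interact: the best
// 4-tile canvas for that shape is 1120x1120 (a 2x2 layout), and scaling 2560x1920 by
// min(1120/2560, 1120/1920) = 0.4375 gives 1120x840, matching the expected image and
// the {2, 2} aspect ratio. A 1024x768 image already fits the 1120x1120 canvas, so it
// is left at its original size while still reporting a {2, 2} ratio.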
func TestPad(t *testing.T) {
type padCase struct {
TestImage image.Image
OutputSize image.Point
AspectRatio image.Point
Expected image.Image
}
cases := []padCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1000, 667)),
OutputSize: image.Point{560, 560},
AspectRatio: image.Point{2, 2},
Expected: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, c := range cases {
actual := padImage(c.TestImage, c.OutputSize, c.AspectRatio)
if actual.Bounds() != c.Expected.Bounds() {
t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
type packCase struct {
TestImage image.Image
AspectRatio image.Point
ExpectedVals int
}
cases := []packCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
AspectRatio: image.Point{2, 2},
ExpectedVals: 2 * 2 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
AspectRatio: image.Point{1, 1},
ExpectedVals: 1 * 1 * 3 * 560 * 560,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1120, 560)),
AspectRatio: image.Point{1, 2},
ExpectedVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, c := range cases {
actualVals := packImages(c.TestImage, c.AspectRatio)
if len(actualVals) != c.ExpectedVals {
t.Errorf("packed image size incorrect: '%d': expected: '%d'", len(actualVals), c.ExpectedVals)
}
}
}
func TestPreprocess(t *testing.T) {
type preprocessCase struct {
TestImage image.Image
ExpectedVals int
ExpectedAspectRatioID int
}
cases := []preprocessCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 10, 10)),
ExpectedVals: 0,
ExpectedAspectRatioID: 1,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
ExpectedVals: 0,
ExpectedAspectRatioID: 6,
},
}
for _, c := range cases {
var buf bytes.Buffer
err := png.Encode(&buf, c.TestImage)
if err != nil {
t.Fatal(err)
}
imgData, opts, err := Preprocess(&buf)
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(imgData) == 0 {
t.Errorf("no image data returned")
}
ar, ok := opts["aspectRatioIndex"]
if !ok {
t.Fatalf("no aspect ratio found")
}
aspectRatioID := ar.(int)
if aspectRatioID != c.ExpectedAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expected: '%d'", aspectRatioID, c.ExpectedAspectRatioID)
}
}
}
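// The expected aspect ratio IDs are the 1-based ranks assigned by the supported
// aspect ratio enumeration: for four tiles the order (width x height) is 1x1, 1x2,
// 1x3, 1x4, 2x1, 2x2, 3x1, 4x1, so a tiny 10x10 image maps to ID 1 (1x1) and a
// 1024x768 image, which is tiled 2x2, maps to ID 6.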
......@@ -2,9 +2,6 @@ package mllama
import (
"bytes"
"encoding/binary"
"fmt"
"hash/fnv"
"image"
"slices"
......@@ -34,10 +31,6 @@ const (
)
func New(c fs.Config) (model.Model, error) {
// Verify unified config
if c.Uint("vision.block_count") == 0 {
return nil, fmt.Errorf("non-unified vision model not supported")
}
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
......@@ -45,10 +38,13 @@ func New(c fs.Config) (model.Model, error) {
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: int32(c.Uint("tokenizer.ggml.bos_token_id")),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
ImageProcessor: newImageProcessor(c),
......@@ -63,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
......@@ -73,81 +69,48 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return nil, err
}
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(image)
f32s, ratio, err := m.ImageProcessor.ProcessImage(image)
if err != nil {
return nil, err
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
m.ImageProcessor.maxNumTiles,
)
if err != nil {
return nil, err
if ratio.numTiles() < m.maxNumTiles {
// Pad tiles to maxNumTiles
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
}
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(aspectRatioID)}, 1)
if err != nil {
return nil, err
}
pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
return m.Projector.Forward(ctx, crossAttentionStates), nil
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
}
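// EncodeMultimodal always hands FromFloatSlice a buffer sized for maxNumTiles tiles
// (imageSize x imageSize x numChannels floats each); images whose tiling needs fewer
// than maxNumTiles tiles are padded out to that length, so the pixelValues tensor has
// a fixed shape regardless of the chosen aspect ratio.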
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var images []input.Input
fnvHash := fnv.New64a()
for i := range inputs {
if inputs[i].Multimodal == nil {
if len(images) > 0 {
inputs[i].Multimodal = []ml.Tensor{images[0].Multimodal.(ml.Tensor)}
inputs[i].MultimodalHash = images[0].MultimodalHash
for j := 1; j < len(images); j++ {
					inputs[i].Multimodal = append(inputs[i].Multimodal.([]ml.Tensor), images[j].Multimodal.(ml.Tensor))
fnvHash.Reset()
binary.Write(fnvHash, binary.NativeEndian, inputs[i].MultimodalHash)
binary.Write(fnvHash, binary.NativeEndian, inputs[j].MultimodalHash)
inputs[i].MultimodalHash = fnvHash.Sum64()
}
images = nil
}
} else {
images = append(images, inputs[i])
inputs[i].Token = -1
if inputs[i].Multimodal != nil {
inputs[i].Token = 128256 // <|image|>
}
}
inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
return inputs, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 {
images := batch.Multimodal[len(batch.Multimodal)-1].Multimodal.([]ml.Tensor)
if len(images) > 0 {
crossAttentionStates = images[len(images)-1]
}
}
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
if err != nil {
return nil, err
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
}
outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
if err != nil {
return nil, err
}
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
// TODO: attention mask, cross attention mask
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, nil, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
}
func init() {
......
......@@ -8,6 +8,8 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
)
type TextSelfAttention struct {
......@@ -18,18 +20,17 @@ type TextSelfAttention struct {
RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
}
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
ropeType := uint32(0)
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
......@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, _ m
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
// This will only get called for layers in the cache, which are just the self attention layers
if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
}
return key, nil
......@@ -69,11 +70,11 @@ type TextSelfAttentionDecoderLayer struct {
MLP *TextMLP
}
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, mask, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, outputs, _, _ ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
......@@ -151,7 +152,7 @@ type TextCrossAttentionDecoderLayer struct {
MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
}
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
......@@ -167,14 +168,14 @@ func (d *TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _,
}
type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor
}
type TextDecoder struct {
Layers []TextDecoderLayer
}
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers {
layerType := selfAttentionLayer
if slices.Contains(opts.crossAttentionLayers, int32(i)) {
......@@ -190,7 +191,7 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
lastLayerOutputs = outputs
}
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, mask, crossAttentionStates, crossAttentionMask, cache, opts)
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, lastLayerOutputs, crossAttentionStates, crossAttentionMask, cache, opts)
}
}
......@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,
type TextModelOptions struct {
hiddenSize, numHeads, numKVHeads int
ropeDim int
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []int32
}
......@@ -214,9 +215,9 @@ type TextModel struct {
*TextModelOptions
}
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, outputs, crossAttentionStates, crossAttentionMask ml.Tensor, cache *kvcache.WrapperCache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, outputs, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState)
}
......@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
ropeDim: int(c.Uint("rope.dimension_count")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
},
}
......
......@@ -15,9 +15,7 @@ type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"`
Gate ml.Tensor `gguf:"attn_gate"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
......@@ -25,56 +23,38 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
hiddenState = sa.Output.Forward(ctx, attention)
if sa.Gate != nil {
hiddenState = hiddenState.Mul(ctx, sa.Gate)
}
return hiddenState
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
Down *nn.Linear `gguf:"ffn_down"`
Up *nn.Linear `gguf:"ffn_up"`
Gate ml.Tensor `gguf:"ffn_gate"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
hiddenState = mlp.Up.Forward(ctx, hiddenState)
if mlp.Gate != nil {
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
}
hiddenState = mlp.Up.Forward(ctx, hiddenState).GELU(ctx)
hiddenState = mlp.Down.Forward(ctx, hiddenState)
return hiddenState
}
type VisionEncoderLayer struct {
AttentionNorm *nn.LayerNorm `gguf:"ln1"`
AttentionNorm *nn.LayerNorm `gguf:"attn_norm"`
SelfAttention *VisionSelfAttention
AttentionGate ml.Tensor `gguf:"attn_gate"`
MLPNorm *nn.LayerNorm `gguf:"ln2"`
MLPNorm *nn.LayerNorm `gguf:"ffn_norm"`
MLP *VisionMLP
MLPGate ml.Tensor `gguf:"ffn_gate"`
}
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
......@@ -83,13 +63,19 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
// self attention
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
if e.AttentionGate != nil {
hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
}
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
// feed forward
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual)
if e.MLPGate != nil {
hiddenState = hiddenState.Mul(ctx, e.MLPGate)
}
hiddenState = hiddenState.Add(ctx, residual)
return hiddenState
}
type VisionEncoder struct {
......@@ -114,9 +100,9 @@ type PrecomputedAspectRatioEmbedding struct {
Gate ml.Tensor `gguf:"gate"`
}
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, numTiles int, opts *VisionModelOptions) ml.Tensor {
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, numTiles)
if e.Gate != nil {
embeddings = embeddings.Mul(ctx, e.Gate)
}
......@@ -132,7 +118,7 @@ type PrecomputedPositionEmbedding struct {
TilePositionEmbeddingGate ml.Tensor `gguf:"tile_position_embd.gate"`
}
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int, opts *VisionModelOptions) ml.Tensor {
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions, numTiles int, opts *VisionModelOptions) ml.Tensor {
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
if e.PositionEmbeddingGate != nil {
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
......@@ -141,7 +127,7 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
hiddenState = hiddenState.Add(ctx, positionEmbedding)
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, numTiles)
if e.TilePositionEmbeddingGate != nil {
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
}
......@@ -150,9 +136,9 @@ func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, posi
}
type VisionModelOptions struct {
hiddenSize, numHeads, numTiles int
imageSize, patchSize int
eps float32
hiddenSize, numHeads int
imageSize, patchSize int
eps float32
intermediateLayersIndices []int32
}
......@@ -181,14 +167,16 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
numPositions++
}
numTiles := pixelValues.Dim(3)
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, numTiles)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
hiddenState = m.ClassEmbedding.Repeat(ctx, 2, m.numTiles).Concat(ctx, hiddenState, 1)
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = m.ClassEmbedding.Repeat(ctx, 2, numTiles).Concat(ctx, hiddenState, 1)
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, numTiles, m.VisionModelOptions)
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
......@@ -199,18 +187,18 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRa
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, numTiles, m.VisionModelOptions)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numTiles*(numPositions+numPaddingPatches), batchSize)
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
hiddenStates = hiddenStates.Reshape(ctx, len(intermediateHiddenStates)*m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenStates = hiddenStates.Pad(ctx, 0, -numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, numTiles, batchSize)
hiddenState = hiddenState.Pad(ctx, 0, -numPaddingPatches, 0, 0)
return hiddenState.Concat(ctx, hiddenStates, 0)
}
......@@ -222,7 +210,6 @@ func newVisionModel(c fs.Config) *VisionModel {
VisionModelOptions: &VisionModelOptions{
hiddenSize: int(c.Uint("vision.embedding_length")),
numHeads: int(c.Uint("vision.attention.head_count")),
numTiles: int(c.Uint("vision.max_num_tiles")),
imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")),
......
......@@ -2,17 +2,31 @@ package mllama
import (
"image"
"image/color"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
type supportedAspectRatio struct {
rank, width, height int
}
func (a supportedAspectRatio) Point() image.Point {
return image.Point{a.width, a.height}
}
func (a supportedAspectRatio) numTiles() int {
return a.width * a.height
}
type ImageProcessor struct {
imageSize, numChannels, maxNumTiles int
mean, std [3]float32
}
func newImageProcessor(c fs.Config) ImageProcessor {
......@@ -20,71 +34,49 @@ func newImageProcessor(c fs.Config) ImageProcessor {
imageSize: int(c.Uint("vision.image_size")),
numChannels: int(c.Uint("vision.num_channels")),
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
mean: imageproc.ClipDefaultMean,
std: imageproc.ClipDefaultSTD,
}
}
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
for w := 1; w <= p.maxNumTiles; w++ {
for h := 1; h <= p.maxNumTiles/w; h++ {
ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
}
}
return ratios
}
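// For example, with maxNumTiles=4 this enumerates, in rank order:
// {1, 1x1}, {2, 1x2}, {3, 1x3}, {4, 1x4}, {5, 2x1}, {6, 2x2}, {7, 3x1}, {8, 4x1}
// (width x height); the rank doubles as the 1-based aspect ratio ID used elsewhere.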
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
r := math.Min(
float64(tw)/float64(imageSize.X),
float64(th)/float64(imageSize.Y),
)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
return image.Point{w, h}
}
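// fitToCanvas preserves the image's aspect ratio: each target dimension is clamped to
// [imageSize, canvas], the smaller of the two resulting scale factors is applied to
// both axes, and the result never exceeds the canvas. For example, 2560x1920 on a
// 1120x1120 canvas scales by min(1120/2560, 1120/1920) = 0.4375, giving 1120x840.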
func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
possibleTileArrangements := p.supportedAspectRatios()
possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
for i, pta := range possibleTileArrangements {
possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
scales := make([]float64, len(possibleCanvasSizes))
for i, pcs := range possibleCanvasSizes {
scales[i] = min(
float64(pcs.Y)/float64(imageSize.Y),
float64(pcs.X)/float64(imageSize.X),
)
}
var minUpscale float64
......@@ -123,47 +115,41 @@ func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles
return selectedCanvas
}
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
if subImg, ok := img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}); ok {
images = append(images, subImg.SubImage(rect))
} else {
// Handle the case where img does not implement SubImage
// This is a fallback and may not be efficient
newImg := image.NewRGBA(rect)
draw.Draw(newImg, rect, img, rect.Min, draw.Src)
images = append(images, newImg)
}
}
}
return images
}
// remove the "alpha" channel by drawing over a prefilled image
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
dst := image.NewRGBA(img.Bounds())
white := color.RGBA{255, 255, 255, 255}
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
return dst
}
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
canvasSize := p.optimalTiledCanvas(b.Max)
aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
newSize := p.fitToCanvas(b.Max, canvasSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
......@@ -177,10 +163,10 @@ func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImag
return dst, aspectRatio
}
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
X: p.imageSize * aspectRatio.X,
Y: p.imageSize * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
......@@ -189,7 +175,7 @@ func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Poin
return dst
}
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
subImages := p.splitToTiles(img, aspectRatio)
var pixelVals []float32
......@@ -205,9 +191,9 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0]
gVal = (gVal - mean[1]) / std[1]
bVal = (bVal - mean[2]) / std[2]
rVal = (rVal - p.mean[0]) / p.std[0]
gVal = (gVal - p.mean[1]) / p.std[1]
bVal = (bVal - p.mean[2]) / p.std[2]
rVals = append(rVals, rVal)
gVals = append(gVals, gVal)
......@@ -222,17 +208,15 @@ func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, st
return pixelVals
}
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
outputSize := image.Point{p.imageSize, p.imageSize}
// clip values
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
newImage, newImageRatio := p.resize(img)
newImage = p.pad(newImage, newImageRatio)
pixelValues := p.pack(newImage, newImageRatio)
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
newImage = p.pad(newImage, outputSize, aspectRatio)
supportedAspectRatios := p.supportedAspectRatios()
aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
return i.width == newImageRatio.X && i.height == newImageRatio.Y
})
data := p.pack(newImage, aspectRatio, mean, std)
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
return pixelValues, supportedAspectRatios[aspectRatioID], nil
}
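// ProcessImage thus returns the packed pixel values (one imageSize x imageSize tile at
// a time, normalized with the CLIP mean/std stored on the processor) together with the
// supportedAspectRatio entry whose width and height match the tiling picked by resize;
// callers such as EncodeMultimodal use that entry's rank as the aspect ratio ID.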