Commit b2b270ad authored by Devon Rifkin

Merge branch 'main' into drifkin/array-head-count-simple

parents 20c5fd39 2bb69b40
package mllama
import (
"image"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestSupportedAspectRatios(t *testing.T) {
cases := []struct {
p ImageProcessor
want []supportedAspectRatio
}{
{
p: ImageProcessor{maxNumTiles: 1},
want: []supportedAspectRatio{
{1, 1, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 2},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 2, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 3},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 2, 1},
{5, 3, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 4},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 1, 4},
{5, 2, 1},
{6, 2, 2},
{7, 3, 1},
{8, 4, 1},
},
},
}
for _, tt := range cases {
actual := tt.p.supportedAspectRatios()
if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestFitToCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
canvas image.Point
expect image.Point
}{
{
p: ImageProcessor{imageSize: 200},
image: image.Point{400, 400},
canvas: image.Point{640, 480},
expect: image.Point{400, 400},
},
{
p: ImageProcessor{imageSize: 200},
image: image.Point{1024, 768},
canvas: image.Point{640, 480},
expect: image.Point{640, 480},
},
{
p: ImageProcessor{imageSize: 750},
image: image.Point{500, 500},
canvas: image.Point{1000, 1000},
expect: image.Point{750, 750},
},
{
p: ImageProcessor{imageSize: 2000},
image: image.Point{500, 1000},
canvas: image.Point{2000, 2000},
expect: image.Point{1000, 2000},
},
{
p: ImageProcessor{imageSize: 1000},
image: image.Point{4000, 3000},
canvas: image.Point{2000, 1000},
expect: image.Point{1333, 1000},
},
{
p: ImageProcessor{imageSize: 560},
image: image.Point{667, 1000},
canvas: image.Point{1000, 1000},
expect: image.Point{667, 1000},
},
}
for _, tt := range cases {
actual := tt.p.fitToCanvas(tt.image, tt.canvas)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestOptimalTiledCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
expect image.Point
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 1000},
image: image.Point{1024, 768},
expect: image.Point{2000, 1000},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1024, 768},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{800, 600},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{640, 480},
expect: image.Point{1120, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{320, 200},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1320, 200},
expect: image.Point{1680, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{2000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{480, 640},
expect: image.Point{560, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 320},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 1320},
expect: image.Point{560, 1680},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 2000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 10000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 10000},
expect: image.Point{1120, 1120},
},
}
for _, tt := range cases {
actual := tt.p.optimalTiledCanvas(tt.image)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
func TestSplitToTiles(t *testing.T) {
cases := []struct {
imageMax image.Point
numTiles image.Point
expect []image.Image
}{
{
imageMax: image.Point{1024, 768},
numTiles: image.Point{1, 1},
expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
imageMax: image.Point{1000, 500},
numTiles: image.Point{2, 1},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
imageMax: image.Point{1000, 1000},
numTiles: image.Point{2, 2},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
var p ImageProcessor
for _, tt := range cases {
actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
if len(actual) != len(tt.expect) {
t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
}
for i := range actual {
if actual[i].Bounds() != tt.expect[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
}
}
}
}
func TestResize(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
expectImage image.Image
expectAspectRatio image.Point
}{
{
p: ImageProcessor{maxNumTiles: 1, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 2, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{10, 10},
expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{2560, 1920},
expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
expectAspectRatio: image.Point{2, 2},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1024, 768},
expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
expectAspectRatio: image.Point{2, 2},
},
}
for _, tt := range cases {
actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
if actualImage.Bounds() != tt.expectImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
}
if actualAspectRatio != tt.expectAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
}
}
}
func TestPad(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
aspectRatio image.Point
expect image.Image
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1000, 667},
aspectRatio: image.Point{2, 2},
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, tt := range cases {
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
if actual.Bounds() != tt.expect.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
cases := []struct {
imageMax image.Point
aspectRatio image.Point
expectVals int
}{
{
imageMax: image.Point{1120, 1120},
aspectRatio: image.Point{2, 2},
expectVals: 2 * 2 * 3 * 560 * 560,
},
{
imageMax: image.Point{560, 560},
aspectRatio: image.Point{1, 1},
expectVals: 1 * 1 * 3 * 560 * 560,
},
{
imageMax: image.Point{1120, 560},
aspectRatio: image.Point{1, 2},
expectVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, tt := range cases {
var p ImageProcessor
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
if len(actualVals) != tt.expectVals {
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
}
}
}
func TestPreprocess(t *testing.T) {
cases := []struct {
imageMax image.Point
expectAspectRatioID int
}{
{
imageMax: image.Point{10, 10},
expectAspectRatioID: 1,
},
{
imageMax: image.Point{1024, 768},
expectAspectRatioID: 6,
},
}
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
for _, tt := range cases {
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(img) == 0 {
t.Errorf("no image data returned")
}
if aspectRatio.rank != tt.expectAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
}
}
}
@@ -7,4 +7,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/qwen2"
+	_ "github.com/ollama/ollama/model/models/qwen25vl"
+	_ "github.com/ollama/ollama/model/models/qwen3"
 )
package qwen2
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
headDim, ropeDim int
eps, ropeBase, ropeScale float32
}
type Attention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenStates.Dim(1)
headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
ropeDim := cmp.Or(opts.ropeDim, headDim)
query := attn.Query.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
key := attn.Key.Forward(ctx, hiddenStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
value := attn.Value.Forward(ctx, hiddenStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
return attn.Output.Forward(ctx, attention)
}
type MLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type DecoderLayer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
Attention *Attention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *MLP
}
func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenStates
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.MLP.Forward(ctx, hiddenStates)
return hiddenStates.Add(ctx, residual)
}
type Model struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []DecoderLayer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Options
}
// Forward implements model.Model.
func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
hiddenStates = m.Output.Forward(ctx, hiddenStates)
return hiddenStates, nil
}
func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
}
func New(c fs.Config) (model.Model, error) {
m := Model{
Layers: make([]DecoderLayer, c.Uint("block_count")),
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Options: Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
headDim: int(c.Uint("attention.key_length")),
ropeDim: int(c.Uint("rope.dimension_count")),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
eps: c.Float("attention.layer_norm_rms_epsilon"),
},
}
m.Cache = kvcache.NewCausalCache(m.Shift)
return &m, nil
}
func init() {
model.Register("qwen2", New)
}
package qwen25vl
import (
"bytes"
"fmt"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Model struct {
model.Base
model.BytePairEncoding
*TextModel
*VisionModel `gguf:"v,vision"`
ImageProcessor
}
// Implement MultimodalProcessor interface
var _ model.MultimodalProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: NewTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
return m, nil
}
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *Grid, error) {
image, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
if err != nil {
return nil, nil, err
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
pixels, grid, err := m.PixelValues(ctx, multimodalData)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
var (
imageToken int32 = 151655
visionStartToken int32 = 151652
visionEndToken int32 = 151653
)
nImg := 0
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else {
// Adding the 'Picture' prefix is a hack: at the time of writing there is no
// way to prefix the image tokens with a prompt, so we add one here
nImg++
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
if err != nil {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, input.Input{Token: visionStartToken})
// Add the image token with the multimodal tensor data at the first position
result = append(result, input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
// Add the placeholder tokens for the remaining positions (patchesPerChunk-1)
result = append(result, slices.Repeat([]input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, input.Input{Token: visionEndToken})
}
}
return result, nil
}
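// Illustrative sketch of the resulting layout (not from this diff; assumes one
// image whose tensor has 4 patches, so patchesPerChunk=4): PostTokenize yields
//   [" Picture 1: " tokens..., 151652, 151655(+tensor, SameBatch=4), 151655, 151655, 151655, 151653]
// i.e. a single image token carries the multimodal tensor and the remaining
// patchesPerChunk-1 positions are placeholder image tokens.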
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
}
func init() {
model.Register("qwen25vl", New)
}
package qwen25vl
import (
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model/input"
)
type TextOptions struct {
hiddenSize, numHeads, numKVHeads int
ropeDim, originalContextLength int
eps, ropeBase, ropeScale float32
}
type TextModel struct {
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
*TextOptions
}
func NewTextModel(c fs.Config) *TextModel {
m := TextModel{
Layers: make([]Layer, c.Uint("block_count")),
TextOptions: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
ropeDim: int(c.Uint("rope.dimension_count", 128)),
originalContextLength: int(c.Uint("context_length", 128000)),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
},
}
return &m
}
// SelfAttention implements the multi-head self-attention mechanism
// with separate projections for query, key, value and output transformations
type SelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
scaleFactor := 1.0 / math.Sqrt(float64(headDim))
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, kqv)
}
// Shift applies rotary position embeddings to the key tensor for causal attention caching
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
}
// MLP implements the feed-forward network component with SwiGLU activation
type MLP struct {
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
Gate *nn.Linear `gguf:"ffn_gate"`
}
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextOptions) ml.Tensor {
// Apply SwiGLU activation gating
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
// Project back to hidden dimension
return mlp.Down.Forward(ctx, hiddenState)
}
// Layer represents a single transformer layer combining self-attention and feed-forward components
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
SelfAttention *SelfAttention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
// Self-attention branch with residual connection
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
// In the final layer (outputs != nil), optimize by pruning to just the token positions
// we need logits for.
if outputs != nil {
hiddenState = hiddenState.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenState = hiddenState.Add(ctx, residual)
// Feed-forward branch with residual connection
residual = hiddenState
hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual)
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) (ml.Tensor, error) {
// Initial token embedding
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}
// Process through transformer layers
for i, layer := range m.Layers {
cache.SetLayer(i)
var lastLayerOutputs ml.Tensor
if i == len(m.Layers)-1 {
lastLayerOutputs = outputs
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, cache, m.TextOptions)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
package qwen25vl
import (
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
// We only support a batch size of 1
const batchSize = 1
func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
return x2.Neg(ctx).Concat(ctx, x1, 0)
}
func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
}
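// Worked example of the rotate-half convention above: for a head vector split
// into halves x1 and x2, rotateHalf([x1, x2]) = [-x2, x1], so
// applyRotaryPositionalEmbedding gives [x1*cos - x2*sin, x2*cos + x1*sin],
// the standard 2D rotation applied pairwise across the two halves.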
func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int) ml.Tensor {
// Create a flat slice for the mask (all -inf initially to block all attention)
flat := make([]float32, seqLength*seqLength)
for i := range flat {
flat[i] = float32(math.Inf(-1)) // Negative infinity to block attention
}
// Fill in the mask with zeros for tokens that CAN attend to each other
for i := 1; i < len(bounds); i++ {
start := bounds[i-1]
end := bounds[i]
// Enable attention within this sequence block by setting values to 0
for row := start; row < end; row++ {
for col := start; col < end; col++ {
idx := row*seqLength + col
flat[idx] = 0.0 // 0 allows attention, -inf blocks it
}
}
}
mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
// Reshape to match [seqLength, seqLength, 1] for broadcasting
mask = mask.Reshape(ctx, seqLength, seqLength, 1)
return mask
}
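// Sketch of the mask for seqLength=4 and bounds=[0, 2, 4] (two windows of two
// tokens each); 0 permits attention within a window, -inf blocks it:
//   [   0,    0, -inf, -inf]
//   [   0,    0, -inf, -inf]
//   [-inf, -inf,    0,    0]
//   [-inf, -inf,    0,    0]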
type VisionSelfAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
query := sa.Query.Forward(ctx, hiddenStates)
key := sa.Key.Forward(ctx, hiddenStates)
value := sa.Value.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1), batchSize)
key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1), batchSize)
value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1), batchSize)
query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
// Scale factor for scaled dot-product attention
scale := 1.0 / math.Sqrt(float64(opts.headDim))
// Scaled dot-product attention
query = query.Permute(ctx, 0, 2, 1, 3)
key = key.Permute(ctx, 0, 2, 1, 3)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
kq := key.MulmatFullPrec(ctx, query)
kq = kq.Scale(ctx, scale)
if mask != nil {
kq = kq.Add(ctx, mask)
}
kq = kq.Softmax(ctx)
kqv := value.Mulmat(ctx, kq)
attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
return sa.Output.Forward(ctx, attention)
}
// VisionMLP implements the multi-layer perceptron
type VisionMLP struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
// SwiGLU-style gating: SILU on the gate projection, multiplied by the up projection
gateOutput := mlp.Gate.Forward(ctx, hiddenStates)
upOutput := mlp.Up.Forward(ctx, hiddenStates)
hiddenStates = gateOutput.SILU(ctx).Mul(ctx, upOutput)
return mlp.Down.Forward(ctx, hiddenStates)
}
type VisionEncoderLayer struct {
Norm1 *nn.RMSNorm `gguf:"ln1"`
SelfAttention *VisionSelfAttention
Norm2 *nn.RMSNorm `gguf:"ln2"`
MLP *VisionMLP
}
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
residual := hiddenStates
hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, cos, sin, mask, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
return hiddenStates.Add(ctx, residual)
}
// VisionModelOptions contains configuration options
type VisionModelOptions struct {
hiddenSize int
numHeads int
headDim int
patchSize int
numChannels int
eps float32
ropeTheta float32
spatialMergeSize int
windowSize int
fullAttnBlocks []int32
temporalPatchSize int
}
type PatchEmbedding struct {
PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
}
func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
numPatches := pixelValues.Shape()[1]
// Reshape the input tensor to match the expected dimensions
pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
// Permute the tensor to bring the temporal dimension to the front
pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
// Split the tensor into parts for the temporal convolutions
in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
s0, s1 := opts.patchSize, opts.patchSize // Use full stride
p0, p1 := 0, 0 // padding
d0, d1 := 1, 1 // dilation
out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
// Add the outputs from the two temporal convolutions
out := out0.Add(ctx, out1)
// Reshape the output tensor to match the expected dimensions
return out.Reshape(ctx, opts.hiddenSize, numPatches)
}
// VisionPatchMerger implements patch merging for the Qwen vision model
type VisionPatchMerger struct {
LNQ *nn.RMSNorm `gguf:"ln_q"`
MLP0 *nn.Linear `gguf:"mlp.0"`
MLP2 *nn.Linear `gguf:"mlp.2"`
}
// Forward computes patch merging for the vision model
func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)
hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)
// Reshape the normalized output to view the hidden size dimension
reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize), batchSize)
hidden := pm.MLP0.Forward(ctx, reshaped)
activated := hidden.GELU(ctx)
output := pm.MLP2.Forward(ctx, activated)
return output
}
// VisionModel implements the Qwen vision model
type VisionModel struct {
PatchEmbedding *PatchEmbedding
Layers []VisionEncoderLayer `gguf:"blk"`
PatchMerger *VisionPatchMerger `gguf:"merger"`
*VisionModelOptions
}
// Forward computes the vision model for an input tensor
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
// Extract patch embeddings
hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)
positionEmbedding := m.PositionalEmbedding(ctx, grid)
windowIndex, bounds := m.WindowIndex(ctx, grid)
spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
hiddenStates = hiddenStates.Rows(ctx, windowIndex)
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)
positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)*spatialMergeUnit, positionEmbedding.Dim(1)/spatialMergeUnit)
positionEmbedding = positionEmbedding.Rows(ctx, windowIndex)
positionEmbedding = positionEmbedding.Reshape(ctx, positionEmbedding.Dim(0)/spatialMergeUnit, positionEmbedding.Dim(1)*spatialMergeUnit)
positionEmbedding = positionEmbedding.Concat(ctx, positionEmbedding, 0)
cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
cos = cos.Reshape(ctx, cos.Dim(0), 1, cos.Dim(1))
sin = sin.Reshape(ctx, sin.Dim(0), 1, sin.Dim(1))
mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds, m.VisionModelOptions.numHeads)
// Apply encoder layers
for i, layer := range m.Layers {
if slices.Contains(m.fullAttnBlocks, int32(i)) {
hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, nil, m.VisionModelOptions)
} else {
hiddenStates = layer.Forward(
ctx,
hiddenStates,
cos,
sin,
mask,
m.VisionModelOptions,
)
}
}
hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)
reverseWindowIndex := windowIndex.Argsort(ctx)
return hiddenStates.Rows(ctx, reverseWindowIndex)
}
// WindowIndex divides the grid into windows and returns:
// 1. A tensor containing flattened indices of all grid points organized by windows
// 2. A slice of boundaries that mark where each window's data begins and ends
// in the flattened representation, scaled by spatialMergeSize squared
//
// The boundaries slice always starts with 0 and contains cumulative ending
// positions for each window, allowing downstream processing to identify
// window boundaries in the tensor data.
func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int) {
vitMergerWindowSize := m.windowSize / m.spatialMergeSize / m.patchSize
llmGridH := grid.Height / m.spatialMergeSize
llmGridW := grid.Width / m.spatialMergeSize
// Calculate window parameters
numWindowsH := int(math.Ceil(float64(llmGridH) / float64(vitMergerWindowSize)))
numWindowsW := int(math.Ceil(float64(llmGridW) / float64(vitMergerWindowSize)))
// Initialize index_new slice
var index []int32
// Initialize bounds with the first element as 0
bounds := []int{0}
totalSeqLen := 0
// Process each window without padding
for wh := range numWindowsH {
for ww := range numWindowsW {
// Calculate window boundaries
hStart := wh * vitMergerWindowSize
wStart := ww * vitMergerWindowSize
hEnd := min(hStart+vitMergerWindowSize, llmGridH)
wEnd := min(wStart+vitMergerWindowSize, llmGridW)
// Calculate sequence length for this window
seqLen := (hEnd - hStart) * (wEnd - wStart)
// Collect indices for this window
for h := hStart; h < hEnd; h++ {
for w := wStart; w < wEnd; w++ {
index = append(index, int32(h*llmGridW+w))
}
}
totalSeqLen += seqLen
bounds = append(bounds, totalSeqLen*(m.spatialMergeSize*m.spatialMergeSize)+bounds[0])
}
}
t := ctx.Input().FromIntSlice(index, len(index))
return t, bounds
}
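// Worked example (assuming the defaults windowSize=112, spatialMergeSize=2,
// patchSize=14, so vitMergerWindowSize=4): a grid with Height=Width=8 gives
// llmGridH=llmGridW=4, a single 4x4 window, index=[0..15], and
// bounds=[0, 64], since 16 positions * spatialMergeSize^2 = 64.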
// PositionalEmbedding generates rotary position embeddings for attention mechanisms
func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor {
dim := m.headDim / 2
freq := dim / 2
theta := float64(m.ropeTheta)
merge := m.spatialMergeSize
// Create frequency patterns for position encoding
maxGridSize := max(grid.Height, grid.Width)
freqVals := make([]float32, freq*maxGridSize)
for i := range maxGridSize {
for j := range freq {
freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
}
}
freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
// Create position coordinates (y,x pairs) for the grid
// In PyTorch: Equivalent to generating position ids with torch.arange()
coords := make([]int32, 0, grid.Height*grid.Width*2)
for y := range grid.Height {
for x := range grid.Width {
coords = append(coords, int32(y), int32(x))
}
}
pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
// Reshape and permute positions to match spatial merging pattern
pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
pos = pos.Reshape(ctx, 2, merge, merge, grid.Width/merge*grid.Height/merge)
pos = pos.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
pos = pos.Reshape(ctx, 2*merge*merge*grid.Width/merge*grid.Height/merge)
// Use position indices to look up corresponding frequency values
positionalEmbedding := freqs.Rows(ctx, pos)
positionalEmbedding = positionalEmbedding.Reshape(ctx, positionalEmbedding.Dim(0)*2, positionalEmbedding.Dim(1)/2)
return positionalEmbedding
}
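// Sketch of the frequency table above (assuming headDim=80, so dim=40 and
// freq=20): freqVals[i*freq+j] = i / theta^(2j/dim), the standard RoPE
// inverse-frequency schedule, evaluated per grid row/column index rather than
// per token position.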
// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
patchSize := int(c.Uint("vision.patch_size", 14))
hiddenSize := int(c.Uint("vision.embedding_length", 1280))
numHeads := int(c.Uint("vision.attention.head_count", 16))
numChannels := int(c.Uint("vision.num_channels", 3))
eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
windowSize := int(c.Uint("vision.window_size", 112))
fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
model := &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
VisionModelOptions: &VisionModelOptions{
hiddenSize: hiddenSize,
numHeads: numHeads,
headDim: hiddenSize / numHeads,
patchSize: patchSize,
numChannels: numChannels,
eps: eps,
ropeTheta: ropeTheta,
spatialMergeSize: spatialMergeSize,
windowSize: windowSize,
temporalPatchSize: temporalPatchSize,
fullAttnBlocks: fullAttnBlocks,
},
}
return model
}
package qwen25vl
import (
"fmt"
"image"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/model/imageproc"
)
// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
type ImageProcessor struct {
numChannels int
patchSize int
temporalPatchSize int
mergeSize int
minPixels int
maxPixels int
factor int
rescaleFactor float32
imageMean []float32
imageStd []float32
}
// newImageProcessor creates a new image processor with default values
func newImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
return ImageProcessor{
numChannels: int(c.Uint("vision.num_channels", 3)), // not set
patchSize: patchSize,
temporalPatchSize: 2,
mergeSize: mergeSize,
minPixels: 56 * 56,
maxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
factor: patchSize * mergeSize,
rescaleFactor: 1.0 / 255.0,
imageMean: imageproc.ClipDefaultMean[:],
imageStd: imageproc.ClipDefaultSTD[:],
}
}
// SmartResize rounds image dimensions to multiples of factor while keeping
// the total pixel count within [minPixels, maxPixels]
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
factor := p.factor
if height < factor || width < factor {
panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
}
round := func(x float64) int { return int(math.RoundToEven(x)) }
hBar := round(float64(height)/float64(factor)) * factor
wBar := round(float64(width)/float64(factor)) * factor
if hBar*wBar > p.maxPixels {
beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
} else if hBar*wBar < p.minPixels {
beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
}
return hBar, wBar
}
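// Hedged worked example with the defaults above (factor=28,
// maxPixels=28*28*1280): SmartResize(1000, 667) rounds each side to the
// nearest multiple of 28, giving (1008, 672); 1008*672=677376 pixels falls
// inside [minPixels, maxPixels], so neither beta rescale branch fires.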
type Grid struct {
Height int
Width int
Temporal int
}
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
origWidth := img.Bounds().Dx()
origHeight := img.Bounds().Dy()
// Calculate smart resize dimensions
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
// Resize image using existing functions
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
normalizedPixels := imageproc.Normalize(
resizedImg,
[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
true, // rescale
true, // channelFirst
)
// Calculate grid dimensions
grid := &Grid{
Height: resizedHeight / p.patchSize,
Width: resizedWidth / p.patchSize,
Temporal: 1, // For single images, temporal dimension is 1
}
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
if err != nil {
return nil, nil, fmt.Errorf("failed to create patches: %v", err)
}
// Return patches and grid dimensions
return patches, grid, nil
}
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
channels := p.numChannels
patchSize := p.patchSize
mergeSize := p.mergeSize
temporalPatchSize := p.temporalPatchSize
// Calculate output dimensions
numPatches := grid.Temporal * grid.Height * grid.Width
patchDim := channels * temporalPatchSize * patchSize * patchSize
result := make([]float32, numPatches*patchDim)
patchIndex := 0
// Single temporal frame handling (copies to all frames)
for range grid.Temporal {
for h := 0; h < grid.Height; h += mergeSize {
for w := 0; w < grid.Width; w += mergeSize {
// Handle the 2x2 merged patches
for mh := range mergeSize {
for mw := range mergeSize {
baseOffset := patchIndex * patchDim
// Extract patch data for first temporal frame
for c := range channels {
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
for py := range patchSize {
for px := range patchSize {
// Calculate source pixel coordinates
y := (h+mh)*patchSize + py
x := (w+mw)*patchSize + px
// Source index in input tensor (CHW format)
srcIdx := c*height*width + y*width + x
// Destination index in first temporal frame
dstIdx := channelOffset + (py * patchSize) + px
if srcIdx < len(pixels) && dstIdx < len(result) {
result[dstIdx] = pixels[srcIdx]
}
}
}
}
// Copy first temporal frame to all other frames
if temporalPatchSize > 1 {
for c := range channels {
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
firstFrameOffset := channelOffset
frameSize := patchSize * patchSize
// Copy first frame to all other frames
for tp := 1; tp < temporalPatchSize; tp++ {
currentFrameOffset := channelOffset + (tp * frameSize)
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
result[firstFrameOffset:firstFrameOffset+frameSize])
}
}
}
patchIndex++
}
}
}
}
}
return result, nil
}
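// Illustrative sizing (defaults assumed: 3 channels, temporalPatchSize=2,
// patchSize=14): patchDim = 3*2*14*14 = 1176 floats per patch, and a
// 1008x672 image yields grid 72x48, i.e. numPatches = 3456 and a result of
// 3456*1176 float32 values.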
package qwen2vl
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"github.com/ollama/ollama/model/imageproc"
)
const (
DefaultFactor = 28
DefaultMinPixels = 56 * 56
DefaultMaxPixels = 14 * 14 * 4 * 1280
)
// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
// 1. Both dimensions of size are divisible by factor
// 2. The area of the image is between minPixels and maxPixels
// 3. The aspect ratio of the image is as close to 1:1 as possible
if size.Y < factor || size.X < factor {
panic("image is too small to resize")
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
panic("aspect ratio must be less than 200:1")
}
f := float64(factor)
width := float64(size.X)
height := float64(size.Y)
xBar := math.Round(width/f) * f
yBar := math.Round(height/f) * f
if xBar*yBar > float64(maxPixels) {
beta := math.Sqrt(height * width / float64(maxPixels))
xBar = math.Floor(width/beta/f) * f
yBar = math.Floor(height/beta/f) * f
} else if xBar*yBar < float64(minPixels) {
beta := math.Sqrt(float64(minPixels) / (height * width))
xBar = math.Ceil(width*beta/f) * f
yBar = math.Ceil(height*beta/f) * f
}
return image.Point{int(xBar), int(yBar)}
}
func resizeImage(img image.Image, format string, size image.Point) image.Image {
if format == "png" {
img = imageproc.Composite(img)
}
return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
img = resizeImage(img, format, size)
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
opts := map[string]any{}
return data, opts, nil
}
package qwen2vl
import (
"bytes"
"image"
"image/png"
"testing"
)
func TestSmartResize(t *testing.T) {
type smartResizeCase struct {
TestImage image.Image
Expected image.Point
}
cases := []smartResizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
Expected: image.Point{980, 980},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
Expected: image.Point{1036, 756},
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
Expected: image.Point{980, 980},
},
}
for _, c := range cases {
b := c.TestImage.Bounds().Max
actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
if actual != c.Expected {
t.Errorf("expected: %v, actual: %v", c.Expected, actual)
}
}
}
func TestPreprocess(t *testing.T) {
type preprocessCase struct {
TestImage image.Image
ExpectedLen int
}
cases := []preprocessCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)),
ExpectedLen: 252 * 252 * 3 * 1,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
ExpectedLen: 980 * 980 * 3 * 1,
},
}
for _, c := range cases {
var buf bytes.Buffer
err := png.Encode(&buf, c.TestImage)
if err != nil {
t.Fatal(err)
}
imgData, _, err := Preprocess(&buf)
if err != nil {
t.Fatalf("error processing: %q", err)
}
switch len(imgData) {
case 0:
t.Errorf("no image data returned")
case c.ExpectedLen:
// ok
default:
t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
}
}
}
package qwen3
import (
"cmp"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Options struct {
hiddenSize, numHeads, numKVHeads int
eps float32
ropeBase, ropeScale float32
keyLength, valueLength int
numExperts, numExpertsUsed int
normTopKProb bool
}
func (o Options) headDim() int {
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
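// cmp.Or returns the first non-zero value, so headDim falls back from
// keyLength to valueLength to hiddenSize/numHeads; e.g. with keyLength=0,
// valueLength=0, hiddenSize=4096, numHeads=32 it returns 128.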
type Attention struct {
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
Query *nn.Linear `gguf:"attn_q"`
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
batchSize := hiddenStates.Dim(1)
query := sa.Query.Forward(ctx, hiddenStates)
key := sa.Key.Forward(ctx, hiddenStates)
value := sa.Value.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
return sa.Output.Forward(ctx, attention)
}
type MLP interface {
Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
}
type sparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.Linear `gguf:"ffn_gate_exps"`
Up *nn.Linear `gguf:"ffn_up_exps"`
Down *nn.Linear `gguf:"ffn_down_exps"`
}
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
routingWeights := routerLogits.Softmax(ctx)
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
}
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
hiddenStates = hiddenStates.SILU(ctx)
hiddenStates = hiddenStates.Mul(ctx, upStates)
experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextStates
}
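// Hedged sketch of the routing above (illustrative numbers in the style of a
// Qwen3 MoE config: numExperts=128, numExpertsUsed=8): softmax over the 128
// router logits, TopK picks 8 expert indices per token, MulmatID evaluates
// only those experts, and with normTopKProb the 8 selected weights are
// renormalized to sum to 1 before the weighted sum of expert outputs.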
type dense struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type Layer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
*Attention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
MLP
}
func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
residual := hiddenStates
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
return hiddenStates.Add(ctx, residual)
}
type Model struct {
model.Base
model.BytePairEncoding
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []Layer `gguf:"blk"`
*Options
}
// Forward implements model.Model.
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
for i, layer := range m.Layers {
m.Cache.SetLayer(i)
var outputs ml.Tensor
if i == len(m.Layers)-1 {
outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
return m.Output.Forward(ctx, hiddenStates), nil
}
func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
}
var _ model.Model = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
layers := make([]Layer, c.Uint("block_count"))
for i := range layers {
if c.String("general.architecture") == "qwen3moe" {
layers[i].MLP = &sparse{}
} else {
layers[i].MLP = &dense{}
}
}
m := Model{
BytePairEncoding: model.NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
Layers: layers,
Options: &Options{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
keyLength: int(c.Uint("attention.key_length")),
valueLength: int(c.Uint("attention.value_length")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
numExperts: int(c.Uint("expert_count")),
numExpertsUsed: int(c.Uint("expert_used_count")),
normTopKProb: c.Bool("norm_top_k_prob", true),
},
}
m.Cache = kvcache.NewCausalCache(m.Shift)
return &m, nil
}
func init() {
model.Register("qwen3", New)
model.Register("qwen3moe", New)
}
@@ -2,10 +2,13 @@ package model
 import (
 	"container/heap"
+	"context"
 	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
+
+	"github.com/ollama/ollama/logutil"
 )
 const spmWhitespaceSep = "▁"
@@ -22,7 +25,7 @@ func (spm SentencePieceModel) Vocabulary() *Vocabulary {
 }
 func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
-	slog.Debug("Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
+	slog.Log(context.TODO(), logutil.LevelTrace, "Tokens", "num tokens", len(vocab.Values), "vals", vocab.Values[:5], "scores", vocab.Scores[:5], "types", vocab.Types[:5])
 	counter := map[int]int{}
 	var maxTokenLen int
@@ -36,7 +39,7 @@ func NewSentencePieceModel(vocab *Vocabulary) SentencePieceModel {
 		}
 	}
-	slog.Debug("Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
+	slog.Log(context.TODO(), logutil.LevelTrace, "Token counts", "normal", counter[TOKEN_TYPE_NORMAL], "unknown", counter[TOKEN_TYPE_UNKNOWN], "control", counter[TOKEN_TYPE_CONTROL],
 		"user defined", counter[TOKEN_TYPE_USER_DEFINED], "unused", counter[TOKEN_TYPE_UNUSED], "byte", counter[TOKEN_TYPE_BYTE],
 		"max token len", maxTokenLen)
@@ -179,24 +182,10 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 		}
 	}
-	if addSpecial && len(ids) > 0 {
-		if spm.vocab.AddBOS {
-			if ids[0] == spm.vocab.BOS {
-				slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
-			}
-			slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
-			ids = append([]int32{spm.vocab.BOS}, ids...)
-		}
-		if spm.vocab.AddEOS {
-			if ids[len(ids)-1] == spm.vocab.EOS {
-				slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
-			}
-			slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
-			ids = append(ids, spm.vocab.EOS)
-		}
+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+	if addSpecial && len(ids) > 0 {
+		ids = spm.vocab.addSpecials(ids)
 	}
 	return ids, nil
@@ -257,5 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 		}
 	}
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
 	return sb.String(), nil
 }
package model
const (
TOKEN_TYPE_NORMAL = iota + 1
TOKEN_TYPE_UNKNOWN
TOKEN_TYPE_CONTROL
TOKEN_TYPE_USER_DEFINED
TOKEN_TYPE_UNUSED
TOKEN_TYPE_BYTE
)
type TextProcessor interface {
Encode(s string, addSpecial bool) ([]int32, error)
Decode([]int32) (string, error)
Is(int32, Special) bool
Vocabulary() *Vocabulary
}
package model

import (
	"log/slog"
	"slices"
	"sync"
)

type Special int32

const (
	SpecialBOS Special = iota
	SpecialEOS
)

type Vocabulary struct {
	Values []string
	Types  []int32
	Scores []float32
	Merges []string

	BOS, EOS       []int32
	AddBOS, AddEOS bool

	specialOnce sync.Once
	special     []string

	valuesOnce sync.Once
	values     map[string]int32

	mergeOnce sync.Once
	merge     map[string]int32
}

func (v *Vocabulary) Is(id int32, special Special) bool {
	switch special {
	case SpecialBOS:
		return slices.Contains(v.BOS, id)
	case SpecialEOS:
		return slices.Contains(v.EOS, id)
	default:
		return false
	}
}
// addSpecials prepends BOS and/or appends EOS according to the vocabulary's
// AddBOS/AddEOS flags, warning when the prompt already carries the token.
// Callers must pass a non-empty ids slice.
func (v *Vocabulary) addSpecials(ids []int32) []int32 {
	if v.AddBOS && len(v.BOS) > 0 {
		if slices.Contains(v.BOS, ids[0]) {
			slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
		}

		slog.Debug("adding bos token to prompt", "id", v.BOS)
		ids = append([]int32{v.BOS[0]}, ids...)
	}

	if v.AddEOS && len(v.EOS) > 0 {
		if slices.Contains(v.EOS, ids[len(ids)-1]) {
			slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
		}

		slog.Debug("adding eos token to prompt", "id", v.EOS)
		ids = append(ids, v.EOS[0])
	}

	return ids
}
func (v *Vocabulary) Encode(s string) int32 {
	v.valuesOnce.Do(func() {
		v.values = make(map[string]int32, len(v.Values))
		for i, value := range v.Values {
			v.values[value] = int32(i)
		}
	})

	if id, ok := v.values[s]; ok {
		return id
	}

	return -1
}

func (v *Vocabulary) Decode(id int32) string {
	return v.Values[id]
}

func (v *Vocabulary) SpecialVocabulary() []string {
	v.specialOnce.Do(func() {
		for i := range v.Values {
			if v.Types[i] == TOKEN_TYPE_CONTROL || v.Types[i] == TOKEN_TYPE_USER_DEFINED {
				v.special = append(v.special, v.Values[i])
			}
		}
	})

	return v.special
}

func (v *Vocabulary) Merge(left, right string) int {
	v.mergeOnce.Do(func() {
		v.merge = make(map[string]int32, len(v.Merges))
		for i, merge := range v.Merges {
			v.merge[merge] = int32(i)
		}
	})

	if id, ok := v.merge[left+" "+right]; ok {
		return int(id)
	}

	return -1
}
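One detail worth calling out: `Merge` keys are the two halves joined by a single space, matching the usual BPE merges format, and -1 signals an unknown pair. For example (the merge list and import path are illustrative):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/model" // assumed import path for the package above
)

func main() {
	v := &model.Vocabulary{Merges: []string{"h e", "he llo"}}

	fmt.Println(v.Merge("h", "e"))    // 0: rank of "h e" in Merges
	fmt.Println(v.Merge("he", "llo")) // 1: rank of "he llo"
	fmt.Println(v.Merge("x", "y"))    // -1: pair is not a known merge
}
```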
package model

import "testing"

func TestVocabulary_SpecialVocabulary(t *testing.T) {
	vocab := &Vocabulary{
		Values: []string{"<|startoftext|>", "<|endoftext|>", "<|tool_call_start|>", "<|tool_call_end|>", "hi"},
		Types:  []int32{TOKEN_TYPE_CONTROL, TOKEN_TYPE_CONTROL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_NORMAL},
	}

	specialVocab := vocab.SpecialVocabulary()
	if len(specialVocab) != 4 {
		t.Errorf("expected 4 special tokens, got %d", len(specialVocab))
	}
}
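A companion test for the refactored `addSpecials` would pin down the expected BOS/EOS placement; a sketch in the same style (the token IDs are made up, and it assumes `slices` is imported alongside `testing`):

```go
func TestVocabulary_AddSpecials(t *testing.T) {
	vocab := &Vocabulary{
		BOS:    []int32{1},
		EOS:    []int32{2},
		AddBOS: true,
		AddEOS: true,
	}

	got := vocab.addSpecials([]int32{10, 11, 12})
	want := []int32{1, 10, 11, 12, 2}
	if !slices.Equal(got, want) {
		t.Errorf("expected %v, got %v", want, got)
	}
}
```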
@@ -292,13 +292,18 @@ func filesForModel(path string) ([]string, error) {
 	}

 	files = append(files, js...)

-	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
-		// add tokenizer.model if it exists; tokenizer.json is automatically picked up by the previous glob
-		// tokenizer.model might be an unresolved git lfs reference; error if it is
-		files = append(files, tks...)
-	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
-		// sometimes tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
-		files = append(files, tks...)
+	// only include tokenizer.model if tokenizer.json is not present
+	if !slices.ContainsFunc(files, func(s string) bool {
+		return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json")
+	}) {
+		if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
+			// add tokenizer.model if it exists; tokenizer.json is automatically picked up by the previous glob
+			// tokenizer.model might be an unresolved git lfs reference; error if it is
+			files = append(files, tks...)
+		} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
+			// sometimes tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
+			files = append(files, tks...)
+		}
 	}

 	return files, nil
...
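The new guard splits each collected path on the OS separator and looks for a tokenizer.json path component anywhere in the tree. The same check in isolation (`hasTokenizerJSON` is a hypothetical name):

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"slices"
	"strings"
)

// hasTokenizerJSON reports whether any collected path has a
// tokenizer.json component, at the top level or in a subdirectory.
func hasTokenizerJSON(files []string) bool {
	return slices.ContainsFunc(files, func(s string) bool {
		return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json")
	})
}

func main() {
	fmt.Println(hasTokenizerJSON([]string{"config.json", "tokenizer.json"}))              // true
	fmt.Println(hasTokenizerJSON([]string{filepath.Join("original", "tokenizer.model")})) // false
	fmt.Println(hasTokenizerJSON([]string{filepath.Join("original", "tokenizer.json")}))  // true
}
```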
@@ -61,6 +61,8 @@ const (
 	ColorGrey    = Esc + "[38;5;245m"
 	ColorDefault = Esc + "[0m"

+	ColorBold = Esc + "[1m"
+
 	StartBracketedPaste = Esc + "[?2004h"
 	EndBracketedPaste   = Esc + "[?2004l"
 )
...
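ColorBold is SGR attribute 1, applied the same way as the existing color escapes; a self-contained demo (Esc and the reset are redefined locally to keep it runnable):

```go
package main

import "fmt"

const (
	Esc          = "\x1b"
	ColorBold    = Esc + "[1m" // SGR 1: bold
	ColorDefault = Esc + "[0m" // SGR 0: reset all attributes
)

func main() {
	fmt.Println(ColorBold + "bold prompt" + ColorDefault + " normal text")
}
```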
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", len(prompt)-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
...
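The reordered slicing takes the cached prefix directly from the incoming prompt instead of truncating the slot's previous contents. A worked illustration of the split (plain ints stand in for the input type):

```go
package main

import "fmt"

func main() {
	prompt := []int{1, 2, 3, 4, 5} // incoming prompt tokens
	numPast := 3                   // length of the prefix already covered by the cache

	slotInputs := prompt[:numPast] // cache now holds exactly the matched prefix: [1 2 3]
	prompt = prompt[numPast:]      // remainder still to be processed: [4 5]

	fmt.Println(slotInputs, prompt)
}
```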
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
-	"slices"
 	"sync"
 	"time"
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex

 	clip *llama.ClipContext
-	mllama *llama.MllamaContext

 	// cache of images to embeddings
 	images []imageCache
var c ImageContext var c ImageContext
if arch == "clip" { if arch == "clip" {
c.clip, err = llama.NewClipContext(llamaContext, modelPath) c.clip, err = llama.NewClipContext(llamaContext, modelPath)
} else if arch == "mllama" {
c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
} else { } else {
return nil, fmt.Errorf("unknown vision model architecture: %s", arch) return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
} }
@@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }

-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
return 0 return 0
} }
// Mllama maps an image to 1 embedding token (llava creates many tokens)
// and doesn't support more than a single image per request.
// The embeddings are large (100 MB), so allocating a big batch can fail
// on some systems
if c.mllama != nil {
return 1
}
return configuredBatchSize return configuredBatchSize
} }
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int { func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
if c != nil && c.mllama != nil { return llamaContext.Model().NEmbd()
return c.mllama.EmbedSize(llamaContext)
} else {
return llamaContext.Model().NEmbd()
}
}
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
if c == nil || c.mllama == nil {
return false
}
return slices.ContainsFunc(inputs, func(input input) bool {
return input.embed != nil
})
} }
type imageCache struct { type imageCache struct {
......
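The embedding cache that remains is keyed by a hash of the raw image bytes via hash/maphash. A minimal sketch of that keying idea (the seed handling and function name are illustrative, not the file's actual code):

```go
package main

import (
	"fmt"
	"hash/maphash"
)

// A single shared seed: maphash values are only comparable
// when produced with the same seed within one process.
var seed = maphash.MakeSeed()

// imageKey derives a process-local cache key from raw image bytes.
func imageKey(data []byte) uint64 {
	return maphash.Bytes(seed, data)
}

func main() {
	a := imageKey([]byte("same image bytes"))
	b := imageKey([]byte("same image bytes"))
	fmt.Println(a == b) // true: identical bytes yield identical keys
}
```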
@@ -11,7 +11,6 @@ import (
 	"net"
 	"net/http"
 	"os"
-	"path/filepath"
 	"regexp"
 	"runtime"
 	"strconv"
@@ -23,8 +22,10 @@ import (
 	"golang.org/x/sync/semaphore"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/runner/common"
 )
@@ -56,10 +57,6 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

-	// does this sequence require cross-attention layers to be processed? - if we have seen
-	// an image for certain multi-modal models
-	crossAttention bool

 	// channel to send responses over
 	responses chan string
@@ -204,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 				return nil, fmt.Errorf("invalid image index: %d", n)
 			}

-			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
 			if err != nil {
 				return nil, err
 			}
@@ -367,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
-	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
batch = tokenBatch batch = tokenBatch
} else { } else {
batch = embedBatch batch = embedBatch
seq.crossAttention = s.image.NeedCrossAttention(input)
} }
} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention { } else if embedding != batch.IsEmbedding() {
s.nextSeq = seqIdx s.nextSeq = seqIdx
break break
} }
@@ -426,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}

-			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
@@ -439,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}

-	s.lc.SetCrossAttention(crossAttention)

 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}

-	if crossAttention {
-		// synchronize state to ensure the cross attention batch is complete.
-		// needed specifically for multi-GPU systems otherwise an inflight
-		// task may be incorrectly invalidated causing a crash
-		s.lc.Synchronize()
-	}

 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
return return
} }
seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
s.seqs[i] = seq s.seqs[i] = seq
s.cond.Signal() s.cond.Signal()
found = true found = true
@@ -680,8 +663,6 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "application/json")

-	slog.Debug("embedding request", "content", req.Content)

 	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
@@ -815,7 +796,7 @@ func Execute(args []string) error {
 	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
 	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
 	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
@@ -830,22 +811,7 @@ func Execute(args []string) error {
 	if err := fs.Parse(args); err != nil {
 		return err
 	}
-	level := slog.LevelInfo
-	if *verbose {
-		level = slog.LevelDebug
-	}
-	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
-		Level:     level,
-		AddSource: true,
-		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
-			if attr.Key == slog.SourceKey {
-				source := attr.Value.Any().(*slog.Source)
-				source.File = filepath.Base(source.File)
-			}
-			return attr
-		},
-	})
-	slog.SetDefault(slog.New(handler))
+	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))

 	slog.Info("starting go runner")
 	llama.BackendInit()
...
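The hand-rolled handler disappears because the level now comes from the environment via envconfig.LogLevel() rather than the -verbose flag, which is still parsed but ignored for compatibility. A sketch of what an env-driven level lookup might look like (the OLLAMA_DEBUG mapping is an assumption):

```go
package envconfig

import (
	"log/slog"
	"os"
)

// LogLevel derives a slog level from the environment; the variable
// name and trace threshold here are assumptions for this sketch.
func LogLevel() slog.Level {
	switch os.Getenv("OLLAMA_DEBUG") {
	case "", "0", "false":
		return slog.LevelInfo
	case "2":
		return slog.LevelDebug - 4 // trace, matching logutil.LevelTrace
	default:
		return slog.LevelDebug
	}
}
```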