Unverified Commit 23125648 authored by Michael Yang, committed by GitHub

chore: update mllama to use ollama engine (#10637)

parent 0478d440
package mllama
import (
	"image"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestSupportedAspectRatios(t *testing.T) {
cases := []struct {
p ImageProcessor
want []supportedAspectRatio
}{
{
p: ImageProcessor{maxNumTiles: 1},
want: []supportedAspectRatio{
{1, 1, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 2},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 2, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 3},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 2, 1},
{5, 3, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 4},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 1, 4},
{5, 2, 1},
{6, 2, 2},
{7, 3, 1},
{8, 4, 1},
},
},
}
for _, tt := range cases {
actual := tt.p.supportedAspectRatios()
if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
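
// An illustrative sketch of supportedAspectRatios, inferred from the expected
// values above (the commit's real implementation lives elsewhere in the
// package): enumerate every tile grid whose area fits within maxNumTiles,
// ranked in discovery order. It assumes supportedAspectRatio's fields are
// rank, width, and height, matching the positional literals above and the
// rank access in TestPreprocess below.
func (p ImageProcessor) supportedAspectRatios() []supportedAspectRatio {
	var ratios []supportedAspectRatio
	for w := 1; w <= p.maxNumTiles; w++ {
		for h := 1; w*h <= p.maxNumTiles; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
		}
	}
	return ratios
}
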
func TestFitToCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
canvas image.Point
expect image.Point
}{
{
p: ImageProcessor{imageSize: 200},
image: image.Point{400, 400},
canvas: image.Point{640, 480},
expect: image.Point{400, 400},
},
{
p: ImageProcessor{imageSize: 200},
image: image.Point{1024, 768},
canvas: image.Point{640, 480},
expect: image.Point{640, 480},
},
{
p: ImageProcessor{imageSize: 750},
image: image.Point{500, 500},
canvas: image.Point{1000, 1000},
expect: image.Point{750, 750},
},
{
p: ImageProcessor{imageSize: 2000},
image: image.Point{500, 1000},
canvas: image.Point{2000, 2000},
expect: image.Point{1000, 2000},
},
{
p: ImageProcessor{imageSize: 1000},
image: image.Point{4000, 3000},
canvas: image.Point{2000, 1000},
expect: image.Point{1333, 1000},
},
{
p: ImageProcessor{imageSize: 560},
image: image.Point{667, 1000},
canvas: image.Point{1000, 1000},
expect: image.Point{667, 1000},
},
}
for _, tt := range cases {
actual := tt.p.fitToCanvas(tt.image, tt.canvas)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
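
// A sketch of fitToCanvas consistent with every case above (assumed, not the
// commit's verbatim code): clamp each dimension between imageSize and the
// canvas, then scale the source by the smaller of the two resulting ratios so
// the result never exceeds the canvas. Uses the Go 1.21 built-in min/max.
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
	clamp := func(v, lo, hi int) int { return max(lo, min(v, hi)) }

	targetWidth := clamp(imageSize.X, p.imageSize, canvasSize.X)
	targetHeight := clamp(imageSize.Y, p.imageSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	if scaleWidth < scaleHeight {
		return image.Point{targetWidth, min(int(float64(imageSize.Y)*scaleWidth), targetHeight)}
	}
	return image.Point{min(int(float64(imageSize.X)*scaleHeight), targetWidth), targetHeight}
}
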
func TestOptimalTiledCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
expect image.Point
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 1000},
image: image.Point{1024, 768},
expect: image.Point{2000, 1000},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1024, 768},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{800, 600},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{640, 480},
expect: image.Point{1120, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{320, 200},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1320, 200},
expect: image.Point{1680, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{2000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{480, 640},
expect: image.Point{560, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 320},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 1320},
expect: image.Point{560, 1680},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 2000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 10000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 10000},
expect: image.Point{1120, 1120},
},
}
for _, tt := range cases {
actual := tt.p.optimalTiledCanvas(tt.image)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
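
// A sketch of optimalTiledCanvas consistent with the cases above (assumed,
// not the commit's verbatim code). Every supported tile grid scaled by
// imageSize is a candidate canvas; for each, take the scale that fits the
// image inside it. If any candidate requires upscaling, pick the smallest
// upscale; otherwise pick the largest (mildest) downscale. Ties go to the
// smallest canvas. Assumes the width/height field names from the sketch above.
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
	ratios := p.supportedAspectRatios()
	canvases := make([]image.Point, len(ratios))
	scales := make([]float64, len(ratios))
	for i, r := range ratios {
		canvases[i] = image.Point{r.width * p.imageSize, r.height * p.imageSize}
		sw := float64(canvases[i].X) / float64(imageSize.X)
		sh := float64(canvases[i].Y) / float64(imageSize.Y)
		if sw < sh {
			scales[i] = sw
		} else {
			scales[i] = sh
		}
	}

	// smallest scale > 1 if the image must be upscaled, else the largest scale
	var selected float64
	for _, s := range scales {
		if s > 1 && (selected <= 1 || s < selected) {
			selected = s
		} else if selected <= 1 && s > selected {
			selected = s
		}
	}

	var best image.Point
	for i, c := range canvases {
		if scales[i] == selected && (best.Eq(image.Point{}) || c.X*c.Y < best.X*best.Y) {
			best = c
		}
	}
	return best
}
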
func TestSplitToTiles(t *testing.T) {
cases := []struct {
imageMax image.Point
numTiles image.Point
expect []image.Image
}{
{
imageMax: image.Point{1024, 768},
numTiles: image.Point{1, 1},
expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
imageMax: image.Point{1000, 500},
numTiles: image.Point{2, 1},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
imageMax: image.Point{1000, 1000},
numTiles: image.Point{2, 2},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
var p ImageProcessor
for _, tt := range cases {
actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
if len(actual) != len(tt.expect) {
t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
}
for i := range actual {
if actual[i].Bounds() != tt.expect[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
}
}
}
}
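
// A sketch of splitToTiles consistent with the cases above (assumed, not the
// commit's verbatim code): cut the bounds into a numTiles.X by numTiles.Y
// grid, row-major, keeping each tile's original coordinates; that is why the
// expected bounds above look like SubImage results.
func (p ImageProcessor) splitToTiles(img image.Image, numTiles image.Point) []image.Image {
	b := img.Bounds()
	tileWidth := b.Dx() / numTiles.X
	tileHeight := b.Dy() / numTiles.Y

	// *image.RGBA, used throughout these tests, implements SubImage
	sub := img.(interface {
		SubImage(image.Rectangle) image.Image
	})

	tiles := make([]image.Image, 0, numTiles.X*numTiles.Y)
	for ty := 0; ty < numTiles.Y; ty++ {
		for tx := 0; tx < numTiles.X; tx++ {
			tiles = append(tiles, sub.SubImage(image.Rect(
				b.Min.X+tx*tileWidth, b.Min.Y+ty*tileHeight,
				b.Min.X+(tx+1)*tileWidth, b.Min.Y+(ty+1)*tileHeight)))
		}
	}
	return tiles
}
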
func TestResize(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
expectImage image.Image
expectAspectRatio image.Point
}{
{
p: ImageProcessor{maxNumTiles: 1, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 2, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{10, 10},
expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{2560, 1920},
expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
expectAspectRatio: image.Point{2, 2},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1024, 768},
expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
expectAspectRatio: image.Point{2, 2},
},
}
for _, tt := range cases {
actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
if actualImage.Bounds() != tt.expectImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
}
if actualAspectRatio != tt.expectAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
}
}
}
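
// A sketch of resize consistent with the cases above (assumed, not the
// commit's verbatim code): pick the optimal tiled canvas, report its tile
// grid as the aspect ratio, and scale the input to fit that canvas. The
// resampling step assumes golang.org/x/image/draw imported as draw. Note that
// image.Rectangle itself implements image.Image, which is why the tests can
// pass one directly.
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
	canvasSize := p.optimalTiledCanvas(img.Bounds().Max)
	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}

	newSize := p.fitToCanvas(img.Bounds().Max, canvasSize)
	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
	draw.ApproxBiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Src, nil)
	return dst, aspectRatio
}
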
func TestPad(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
aspectRatio image.Point
expect image.Image
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1000, 667},
aspectRatio: image.Point{2, 2},
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, tt := range cases {
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
if actual.Bounds() != tt.expect.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
}
}
}
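
// A sketch of pad consistent with the case above (assumed, not the commit's
// verbatim code): draw the image into the top-left corner of a canvas of
// aspectRatio.X by aspectRatio.Y tiles, leaving the remainder as zero
// padding. draw.Draw and draw.Src are also provided by golang.org/x/image/draw.
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
	padded := image.NewRGBA(image.Rect(0, 0, aspectRatio.X*p.imageSize, aspectRatio.Y*p.imageSize))
	b := img.Bounds()
	draw.Draw(padded, image.Rect(0, 0, b.Dx(), b.Dy()), img, b.Min, draw.Src)
	return padded
}
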
func TestPackImages(t *testing.T) {
cases := []struct {
imageMax image.Point
aspectRatio image.Point
expectVals int
}{
{
imageMax: image.Point{1120, 1120},
aspectRatio: image.Point{2, 2},
expectVals: 2 * 2 * 3 * 560 * 560,
},
{
imageMax: image.Point{560, 560},
aspectRatio: image.Point{1, 1},
expectVals: 1 * 1 * 3 * 560 * 560,
},
{
imageMax: image.Point{1120, 560},
aspectRatio: image.Point{1, 2},
expectVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, tt := range cases {
var p ImageProcessor
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
if len(actualVals) != tt.expectVals {
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
}
}
}
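
// A sketch of pack consistent with the expected lengths above (assumed, not
// the commit's verbatim code): split the padded image into its tiles and emit
// three float32 channel values per pixel, so the output length is
// tilesX * tilesY * 3 * tileW * tileH. Real preprocessing would also apply
// mean/std normalization, which the length check cannot observe.
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
	var vals []float32
	for _, tile := range p.splitToTiles(img, aspectRatio) {
		b := tile.Bounds()
		for y := b.Min.Y; y < b.Max.Y; y++ {
			for x := b.Min.X; x < b.Max.X; x++ {
				r, g, bl, _ := tile.At(x, y).RGBA()
				// scale 16-bit channels into [0, 1]
				vals = append(vals, float32(r)/65535, float32(g)/65535, float32(bl)/65535)
			}
		}
	}
	return vals
}
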
func TestPreprocess(t *testing.T) {
cases := []struct {
imageMax image.Point
expectAspectRatioID int
}{
{
imageMax: image.Point{10, 10},
expectAspectRatioID: 1,
},
{
imageMax: image.Point{1024, 768},
expectAspectRatioID: 6,
},
}
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
for _, tt := range cases {
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(img) == 0 {
t.Errorf("no image data returned")
}
		if aspectRatio.rank != tt.expectAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio.rank, tt.expectAspectRatioID)
}
}
}
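
// A sketch of the full ProcessImage pipeline implied by the tests (assumed,
// not the commit's verbatim code): resize to the optimal tiled canvas, pad to
// a whole number of tiles, pack to float32 values, and report the matching
// supported aspect ratio so the caller can pass its rank to the model.
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
	resized, grid := p.resize(img)
	vals := p.pack(p.pad(resized, grid), grid)

	// grid always comes from the supported set, so a match always exists
	var selected supportedAspectRatio
	for _, ratio := range p.supportedAspectRatios() {
		if ratio.width == grid.X && ratio.height == grid.Y {
			selected = ratio
		}
	}
	return vals, selected, nil
}
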
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
-	"slices"
 	"sync"
 	"time"
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex

 	clip *llama.ClipContext
-	mllama *llama.MllamaContext

 	// cache of images to embeddings
 	images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
-	} else if arch == "mllama" {
-		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }

-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}

-	// Mllama maps an image to 1 embedding token (llava creates many tokens)
-	// and doesn't support more than a single image per request.
-	// The embeddings are large (100 MB), so allocating a big batch can fail
-	// on some systems
-	if c.mllama != nil {
-		return 1
-	}
-
 	return configuredBatchSize
 }

 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	if c != nil && c.mllama != nil {
-		return c.mllama.EmbedSize(llamaContext)
-	} else {
-		return llamaContext.Model().NEmbd()
-	}
+	return llamaContext.Model().NEmbd()
 }

-func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
-	if c == nil || c.mllama == nil {
-		return false
-	}
-
-	return slices.ContainsFunc(inputs, func(input input) bool {
-		return input.embed != nil
-	})
-}
-
 type imageCache struct {

...
@@ -57,10 +57,6 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

-	// does this sequence require cross-attention layers to be processed? - if we have seen
-	// an image for certain multi-modal models
-	crossAttention bool
-
 	// channel to send responses over
 	responses chan string
@@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 			return nil, fmt.Errorf("invalid image index: %d", n)
 		}

-		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
 		if err != nil {
 			return nil, err
 		}
@@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
-	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				batch = tokenBatch
 			} else {
 				batch = embedBatch
-				seq.crossAttention = s.image.NeedCrossAttention(input)
 			}
-		} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
+		} else if embedding != batch.IsEmbedding() {
 			s.nextSeq = seqIdx
 			break
 		}
@@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			break
 		}

-		crossAttention = seq.crossAttention
 		batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 		seq.pendingInputs = append(seq.pendingInputs, input)
 		seq.iBatch = batch.NumTokens() - 1
@@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}

-	s.lc.SetCrossAttention(crossAttention)
-
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}

-	if crossAttention {
-		// synchronize state to ensure the cross attention batch is complete.
-		// needed specifically for multi-GPU systems otherwise an inflight
-		// task may be incorrectly invalidated causing a crash
-		s.lc.Synchronize()
-	}
-
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
-
 	s.seqs[i] = seq
 	s.cond.Signal()
 	found = true

...
@@ -3,47 +3,32 @@ package server

 import (
 	"bytes"
 	"context"
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
+	"slices"
 	"strings"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )

 type tokenizeFunc func(context.Context, string) ([]int, error)

-var errTooManyImages = errors.New("vision model only supports a single image per message")
-
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message

-	isMllama := checkMllamaModelFamily(m)
-
-	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	if isMllama {
-		// Our mllama implementation packs all of the embeddings into a single token
-		imageNumTokens = 1
-	} else {
-		// Clip images are represented as 768 tokens, each an embedding
-		imageNumTokens = 768
-	}
+	// Clip images are represented as 768 tokens, each an embedding
+	imageNumTokens := 768

 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
-		if isMllama && len(msgs[i].Images) > 1 {
-			return "", nil, errTooManyImages
-		}
-
 		// always include the last message
 		if i == n {
 			continue
@@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	currMsgIdx := n

 	for cnt, msg := range msgs[currMsgIdx:] {
-		prefix := ""
-		imgPrompt := ""
+		if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
+			return "", nil, errors.New("this model only supports one image while more than one image requested")
+		}
+
+		var prefix string
 		prompt := msg.Content

 		for _, i := range msg.Images {
-			var imgData llm.ImageData
-
-			if isMllama {
-				if len(m.ProjectorPaths) == 0 {
-					imgData = llm.ImageData{
-						ID:   len(images),
-						Data: i,
-					}
-				} else {
-					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
-					if err != nil {
-						return "", nil, err
-					}
-
-					buf := new(bytes.Buffer)
-					err = binary.Write(buf, binary.LittleEndian, data)
-					if err != nil {
-						return "", nil, err
-					}
-
-					ar, ok := opts["aspectRatioIndex"].(int)
-					if !ok {
-						return "", nil, fmt.Errorf("missing aspect ratio for image")
-					}
-
-					imgData = llm.ImageData{
-						ID:            len(images),
-						Data:          buf.Bytes(),
-						AspectRatioID: ar,
-					}
-				}
-				imgPrompt = "<|image|>"
-			} else {
-				imgData = llm.ImageData{
-					ID:   len(images),
-					Data: i,
-				}
-			}
+			imgData := llm.ImageData{
+				ID:   len(images),
+				Data: i,
+			}

 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			images = append(images, imgData)
 		}

-		msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt
+		msgs[currMsgIdx+cnt].Content = prefix + prompt
 	}

 	// truncate any messages that do not fit into the context window
@@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	return b.String(), images, nil
 }
-
-func checkMllamaModelFamily(m *Model) bool {
-	for _, arch := range m.Config.ModelFamilies {
-		if arch == "mllama" {
-			return true
-		}
-	}
-
-	return false
-}
@@ -2,8 +2,6 @@ package server

 import (
 	"bytes"
-	"image"
-	"image/png"
 	"testing"

 	"github.com/google/go-cmp/cmp"
@@ -14,10 +12,9 @@ import (
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
 		images [][]byte
-		aspectRatioID int
 		error  error
 	}

 	tmpl, err := template.Parse(`
@@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) {
 		t.Fatal(err)
 	}

 	visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
-	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
-
-	createImg := func(width, height int) ([]byte, error) {
-		img := image.NewRGBA(image.Rect(0, 0, width, height))
-		var buf bytes.Buffer
-
-		if err := png.Encode(&buf, img); err != nil {
-			return nil, err
-		}
-
-		return buf.Bytes(), nil
-	}
-
-	imgBuf, err := createImg(5, 5)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	imgBuf2, err := createImg(6, 6)
-	if err != nil {
-		t.Fatal(err)
-	}

 	cases := []struct {
 		name string
@@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) {
 				images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
 			},
 		},
-		{
-			name:  "messages with mllama (no images)",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
-			},
-			expect: expect{
-				prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
-			},
-		},
-		{
-			name:  "messages with mllama single prompt",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "messages with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
-			},
-			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "multiple messages with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf, imgBuf2},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "earlier image with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
-				{Role: "assistant", Content: "There are four hotdogs."},
-				{Role: "user", Content: "Which ones have mustard?"},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "too many images with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
-			},
-			expect: expect{
-				error: errTooManyImages,
-			},
-		},
 	}

 	for _, tt := range cases {
@@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) {
 					if !bytes.Equal(images[i].Data, tt.images[i]) {
 						t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
 					}
-				} else {
-					if images[i].AspectRatioID != tt.aspectRatioID {
-						t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
-					}
 				}
 			}
 		})

...
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"cmp"
 	"context"
-	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -35,7 +34,6 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
@@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}

+	if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
+		return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
+	}
+
 	if err := model.CheckCapabilities(caps...); err != nil {
 		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
 	}
@@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	isMllama := checkMllamaModelFamily(m)
-	if isMllama && len(req.Images) > 1 {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
+	if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
 		return
 	}

 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
-		if isMllama && len(m.ProjectorPaths) > 0 {
-			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			ar, ok := opts["aspectRatioIndex"].(int)
-			if !ok {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			buf := new(bytes.Buffer)
-			err = binary.Write(buf, binary.LittleEndian, data)
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
-		} else {
-			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
-		}
+		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 	}

 	prompt := req.Prompt
@@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		for _, i := range images {
 			imgPrompt := ""
-			if isMllama {
-				imgPrompt = "<|image|>"
-			}
 			msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 		}

...
@@ -8,6 +8,7 @@ import (
 	"os"
 	"reflect"
 	"runtime"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): mllama doesn't support parallel yet
-			// see https://github.com/ollama/ollama/issues/4165
-			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
+			// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+			// ref: https://github.com/ollama/ollama/issues/4165
+			if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("mllama doesn't support parallel requests yet")
+				slog.Warn("mllama does not currently support parallel requests")
 			}

 			for {

...