Unverified Commit 23125648 authored by Michael Yang, committed by GitHub

chore: update mllama to use ollama engine (#10637)

parent 0478d440
package mllama
import (
	"image"
	"testing"

	"github.com/google/go-cmp/cmp"
)

func TestSupportedAspectRatios(t *testing.T) {
cases := []struct {
p ImageProcessor
want []supportedAspectRatio
}{
{
p: ImageProcessor{maxNumTiles: 1},
want: []supportedAspectRatio{
{1, 1, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 2},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 2, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 3},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 2, 1},
{5, 3, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 4},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 1, 4},
{5, 2, 1},
{6, 2, 2},
{7, 3, 1},
{8, 4, 1},
},
},
}
for _, tt := range cases {
actual := tt.p.supportedAspectRatios()
if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
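
// An illustrative sketch of supportedAspectRatios, inferred from the expected
// values above (the commit's real implementation lives elsewhere in the
// package): enumerate every tile grid whose area fits within maxNumTiles,
// ranked in discovery order. It assumes supportedAspectRatio's fields are
// rank, width, and height, matching the positional literals above and the
// rank access in TestPreprocess below.
func (p ImageProcessor) supportedAspectRatios() []supportedAspectRatio {
	var ratios []supportedAspectRatio
	for w := 1; w <= p.maxNumTiles; w++ {
		for h := 1; w*h <= p.maxNumTiles; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
		}
	}
	return ratios
}
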
func TestFitToCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
canvas image.Point
expect image.Point
}{
{
p: ImageProcessor{imageSize: 200},
image: image.Point{400, 400},
canvas: image.Point{640, 480},
expect: image.Point{400, 400},
},
{
p: ImageProcessor{imageSize: 200},
image: image.Point{1024, 768},
canvas: image.Point{640, 480},
expect: image.Point{640, 480},
},
{
p: ImageProcessor{imageSize: 750},
image: image.Point{500, 500},
canvas: image.Point{1000, 1000},
expect: image.Point{750, 750},
},
{
p: ImageProcessor{imageSize: 2000},
image: image.Point{500, 1000},
canvas: image.Point{2000, 2000},
expect: image.Point{1000, 2000},
},
{
p: ImageProcessor{imageSize: 1000},
image: image.Point{4000, 3000},
canvas: image.Point{2000, 1000},
expect: image.Point{1333, 1000},
},
{
p: ImageProcessor{imageSize: 560},
image: image.Point{667, 1000},
canvas: image.Point{1000, 1000},
expect: image.Point{667, 1000},
},
}
for _, tt := range cases {
actual := tt.p.fitToCanvas(tt.image, tt.canvas)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
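
// A sketch of fitToCanvas consistent with every case above (assumed, not the
// commit's verbatim code): clamp each dimension between imageSize and the
// canvas, then scale the source by the smaller of the two resulting ratios so
// the result never exceeds the canvas. Uses the Go 1.21 built-in min/max.
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
	clamp := func(v, lo, hi int) int { return max(lo, min(v, hi)) }

	targetWidth := clamp(imageSize.X, p.imageSize, canvasSize.X)
	targetHeight := clamp(imageSize.Y, p.imageSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	if scaleWidth < scaleHeight {
		return image.Point{targetWidth, min(int(float64(imageSize.Y)*scaleWidth), targetHeight)}
	}
	return image.Point{min(int(float64(imageSize.X)*scaleHeight), targetWidth), targetHeight}
}
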
func TestOptimalTiledCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
expect image.Point
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 1000},
image: image.Point{1024, 768},
expect: image.Point{2000, 1000},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1024, 768},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{800, 600},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{640, 480},
expect: image.Point{1120, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{320, 200},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1320, 200},
expect: image.Point{1680, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{2000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{480, 640},
expect: image.Point{560, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 320},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 1320},
expect: image.Point{560, 1680},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 2000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 10000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 10000},
expect: image.Point{1120, 1120},
},
}
for _, tt := range cases {
actual := tt.p.optimalTiledCanvas(tt.image)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
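
// A sketch of optimalTiledCanvas consistent with the cases above (assumed,
// not the commit's verbatim code). Every supported tile grid scaled by
// imageSize is a candidate canvas; for each, take the scale that fits the
// image inside it. If any candidate requires upscaling, pick the smallest
// upscale; otherwise pick the largest (mildest) downscale. Ties go to the
// smallest canvas. Assumes the width/height field names from the sketch above.
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
	ratios := p.supportedAspectRatios()
	canvases := make([]image.Point, len(ratios))
	scales := make([]float64, len(ratios))
	for i, r := range ratios {
		canvases[i] = image.Point{r.width * p.imageSize, r.height * p.imageSize}
		sw := float64(canvases[i].X) / float64(imageSize.X)
		sh := float64(canvases[i].Y) / float64(imageSize.Y)
		if sw < sh {
			scales[i] = sw
		} else {
			scales[i] = sh
		}
	}

	// smallest scale > 1 if the image must be upscaled, else the largest scale
	var selected float64
	for _, s := range scales {
		if s > 1 && (selected <= 1 || s < selected) {
			selected = s
		} else if selected <= 1 && s > selected {
			selected = s
		}
	}

	var best image.Point
	for i, c := range canvases {
		if scales[i] == selected && (best.Eq(image.Point{}) || c.X*c.Y < best.X*best.Y) {
			best = c
		}
	}
	return best
}
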
func TestSplitToTiles(t *testing.T) {
cases := []struct {
imageMax image.Point
numTiles image.Point
expect []image.Image
}{
{
imageMax: image.Point{1024, 768},
numTiles: image.Point{1, 1},
expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
imageMax: image.Point{1000, 500},
numTiles: image.Point{2, 1},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
imageMax: image.Point{1000, 1000},
numTiles: image.Point{2, 2},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
var p ImageProcessor
for _, tt := range cases {
actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
if len(actual) != len(tt.expect) {
t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
}
for i := range actual {
if actual[i].Bounds() != tt.expect[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
}
}
}
}
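
// A sketch of splitToTiles consistent with the cases above (assumed, not the
// commit's verbatim code): cut the bounds into a numTiles.X by numTiles.Y
// grid, row-major, keeping each tile's original coordinates; that is why the
// expected bounds above look like SubImage results.
func (p ImageProcessor) splitToTiles(img image.Image, numTiles image.Point) []image.Image {
	b := img.Bounds()
	tileWidth := b.Dx() / numTiles.X
	tileHeight := b.Dy() / numTiles.Y

	// *image.RGBA, used throughout these tests, implements SubImage
	sub := img.(interface {
		SubImage(image.Rectangle) image.Image
	})

	tiles := make([]image.Image, 0, numTiles.X*numTiles.Y)
	for ty := 0; ty < numTiles.Y; ty++ {
		for tx := 0; tx < numTiles.X; tx++ {
			tiles = append(tiles, sub.SubImage(image.Rect(
				b.Min.X+tx*tileWidth, b.Min.Y+ty*tileHeight,
				b.Min.X+(tx+1)*tileWidth, b.Min.Y+(ty+1)*tileHeight)))
		}
	}
	return tiles
}
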
func TestResize(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
expectImage image.Image
expectAspectRatio image.Point
}{
{
p: ImageProcessor{maxNumTiles: 1, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 2, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{10, 10},
expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{2560, 1920},
expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
expectAspectRatio: image.Point{2, 2},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1024, 768},
expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
expectAspectRatio: image.Point{2, 2},
},
}
for _, tt := range cases {
actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
if actualImage.Bounds() != tt.expectImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
}
if actualAspectRatio != tt.expectAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
}
}
}
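
// A sketch of resize consistent with the cases above (assumed, not the
// commit's verbatim code): pick the optimal tiled canvas, report its tile
// grid as the aspect ratio, and scale the input to fit that canvas. The
// resampling step assumes golang.org/x/image/draw imported as draw. Note that
// image.Rectangle itself implements image.Image, which is why the tests can
// pass one directly.
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
	canvasSize := p.optimalTiledCanvas(img.Bounds().Max)
	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}

	newSize := p.fitToCanvas(img.Bounds().Max, canvasSize)
	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
	draw.ApproxBiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Src, nil)
	return dst, aspectRatio
}
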
func TestPad(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
aspectRatio image.Point
expect image.Image
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1000, 667},
aspectRatio: image.Point{2, 2},
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, tt := range cases {
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
if actual.Bounds() != tt.expect.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
}
}
}
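
// A sketch of pad consistent with the case above (assumed, not the commit's
// verbatim code): draw the image into the top-left corner of a canvas of
// aspectRatio.X by aspectRatio.Y tiles, leaving the remainder as zero
// padding. draw.Draw and draw.Src are also provided by golang.org/x/image/draw.
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
	padded := image.NewRGBA(image.Rect(0, 0, aspectRatio.X*p.imageSize, aspectRatio.Y*p.imageSize))
	b := img.Bounds()
	draw.Draw(padded, image.Rect(0, 0, b.Dx(), b.Dy()), img, b.Min, draw.Src)
	return padded
}
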
func TestPackImages(t *testing.T) {
cases := []struct {
imageMax image.Point
aspectRatio image.Point
expectVals int
}{
{
imageMax: image.Point{1120, 1120},
aspectRatio: image.Point{2, 2},
expectVals: 2 * 2 * 3 * 560 * 560,
},
{
imageMax: image.Point{560, 560},
aspectRatio: image.Point{1, 1},
expectVals: 1 * 1 * 3 * 560 * 560,
},
{
imageMax: image.Point{1120, 560},
aspectRatio: image.Point{1, 2},
expectVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, tt := range cases {
var p ImageProcessor
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
if len(actualVals) != tt.expectVals {
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
}
}
}
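
// A sketch of pack consistent with the expected lengths above (assumed, not
// the commit's verbatim code): split the padded image into its tiles and emit
// three float32 channel values per pixel, so the output length is
// tilesX * tilesY * 3 * tileW * tileH. Real preprocessing would also apply
// mean/std normalization, which the length check cannot observe.
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
	var vals []float32
	for _, tile := range p.splitToTiles(img, aspectRatio) {
		b := tile.Bounds()
		for y := b.Min.Y; y < b.Max.Y; y++ {
			for x := b.Min.X; x < b.Max.X; x++ {
				r, g, bl, _ := tile.At(x, y).RGBA()
				// scale 16-bit channels into [0, 1]
				vals = append(vals, float32(r)/65535, float32(g)/65535, float32(bl)/65535)
			}
		}
	}
	return vals
}
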
func TestPreprocess(t *testing.T) {
cases := []struct {
imageMax image.Point
expectAspectRatioID int
}{
{
imageMax: image.Point{10, 10},
expectAspectRatioID: 1,
},
{
imageMax: image.Point{1024, 768},
expectAspectRatioID: 6,
},
}
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
for _, tt := range cases {
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(img) == 0 {
t.Errorf("no image data returned")
}
		if aspectRatio.rank != tt.expectAspectRatioID {
			t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio.rank, tt.expectAspectRatioID)
}
}
}
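
// A sketch of the full ProcessImage pipeline implied by the tests (assumed,
// not the commit's verbatim code): resize to the optimal tiled canvas, pad to
// a whole number of tiles, pack to float32 values, and report the matching
// supported aspect ratio so the caller can pass its rank to the model.
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
	resized, grid := p.resize(img)
	vals := p.pack(p.pad(resized, grid), grid)

	// grid always comes from the supported set, so a match always exists
	var selected supportedAspectRatio
	for _, ratio := range p.supportedAspectRatios() {
		if ratio.width == grid.X && ratio.height == grid.Y {
			selected = ratio
		}
	}
	return vals, selected, nil
}
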
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
-	"slices"
 	"sync"
 	"time"
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex

 	clip *llama.ClipContext
-	mllama *llama.MllamaContext

 	// cache of images to embeddings
 	images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
-	} else if arch == "mllama" {
-		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }

-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}

-	// Mllama maps an image to 1 embedding token (llava creates many tokens)
-	// and doesn't support more than a single image per request.
-	// The embeddings are large (100 MB), so allocating a big batch can fail
-	// on some systems
-	if c.mllama != nil {
-		return 1
-	}
-
 	return configuredBatchSize
 }

 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	if c != nil && c.mllama != nil {
-		return c.mllama.EmbedSize(llamaContext)
-	} else {
-		return llamaContext.Model().NEmbd()
-	}
+	return llamaContext.Model().NEmbd()
 }

-func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
-	if c == nil || c.mllama == nil {
-		return false
-	}
-
-	return slices.ContainsFunc(inputs, func(input input) bool {
-		return input.embed != nil
-	})
-}
-
 type imageCache struct {

...
@@ -57,10 +57,6 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot

-	// does this sequence require cross-attention layers to be processed? - if we have seen
-	// an image for certain multi-modal models
-	crossAttention bool
-
 	// channel to send responses over
 	responses chan string
@@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 			return nil, fmt.Errorf("invalid image index: %d", n)
 		}

-		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
 		if err != nil {
 			return nil, err
 		}
@@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()

 	var batch *llama.Batch
-	crossAttention := false

 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				batch = tokenBatch
 			} else {
 				batch = embedBatch
-				seq.crossAttention = s.image.NeedCrossAttention(input)
 			}
-		} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
+		} else if embedding != batch.IsEmbedding() {
 			s.nextSeq = seqIdx
 			break
 		}
@@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			break
 		}

-		crossAttention = seq.crossAttention
 		batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 		seq.pendingInputs = append(seq.pendingInputs, input)
 		seq.iBatch = batch.NumTokens() - 1
@@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}

-	s.lc.SetCrossAttention(crossAttention)
-
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}

-	if crossAttention {
-		// synchronize state to ensure the cross attention batch is complete.
-		// needed specifically for multi-GPU systems otherwise an inflight
-		// task may be incorrectly invalidated causing a crash
-		s.lc.Synchronize()
-	}
-
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
-
 	s.seqs[i] = seq
 	s.cond.Signal()
 	found = true

...
@@ -3,47 +3,32 @@ package server

 import (
 	"bytes"
 	"context"
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"log/slog"
+	"slices"
 	"strings"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/template"
 )

 type tokenizeFunc func(context.Context, string) ([]int, error)

-var errTooManyImages = errors.New("vision model only supports a single image per message")
-
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
 func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
 	var system []api.Message

-	isMllama := checkMllamaModelFamily(m)
-
-	var imageNumTokens int
 	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
-	if isMllama {
-		// Our mllama implementation packs all of the embeddings into a single token
-		imageNumTokens = 1
-	} else {
-		// Clip images are represented as 768 tokens, each an embedding
-		imageNumTokens = 768
-	}
+	// Clip images are represented as 768 tokens, each an embedding
+	imageNumTokens := 768

 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
-		if isMllama && len(msgs[i].Images) > 1 {
-			return "", nil, errTooManyImages
-		}
-
 		// always include the last message
 		if i == n {
 			continue
@@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	currMsgIdx := n

 	for cnt, msg := range msgs[currMsgIdx:] {
-		prefix := ""
-		imgPrompt := ""
+		if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
+			return "", nil, errors.New("this model only supports one image while more than one image requested")
+		}
+
+		var prefix string
 		prompt := msg.Content

 		for _, i := range msg.Images {
-			var imgData llm.ImageData
-
-			if isMllama {
-				if len(m.ProjectorPaths) == 0 {
-					imgData = llm.ImageData{
-						ID:   len(images),
-						Data: i,
-					}
-				} else {
-					data, opts, err := mllama.Preprocess(bytes.NewReader(i))
-					if err != nil {
-						return "", nil, err
-					}
-
-					buf := new(bytes.Buffer)
-					err = binary.Write(buf, binary.LittleEndian, data)
-					if err != nil {
-						return "", nil, err
-					}
-
-					ar, ok := opts["aspectRatioIndex"].(int)
-					if !ok {
-						return "", nil, fmt.Errorf("missing aspect ratio for image")
-					}
-
-					imgData = llm.ImageData{
-						ID:            len(images),
-						Data:          buf.Bytes(),
-						AspectRatioID: ar,
-					}
-				}
-				imgPrompt = "<|image|>"
-			} else {
-				imgData = llm.ImageData{
-					ID:   len(images),
-					Data: i,
-				}
-			}
+			imgData := llm.ImageData{
+				ID:   len(images),
+				Data: i,
+			}

 			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			images = append(images, imgData)
 		}

-		msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt
+		msgs[currMsgIdx+cnt].Content = prefix + prompt
 	}

 	// truncate any messages that do not fit into the context window
@@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 	return b.String(), images, nil
 }
-
-func checkMllamaModelFamily(m *Model) bool {
-	for _, arch := range m.Config.ModelFamilies {
-		if arch == "mllama" {
-			return true
-		}
-	}
-
-	return false
-}
@@ -2,8 +2,6 @@ package server

 import (
 	"bytes"
-	"image"
-	"image/png"
 	"testing"

 	"github.com/google/go-cmp/cmp"
@@ -14,10 +12,9 @@ import (
 func TestChatPrompt(t *testing.T) {
 	type expect struct {
 		prompt string
 		images [][]byte
-		aspectRatioID int
 		error  error
 	}

 	tmpl, err := template.Parse(`
@@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) {
 		t.Fatal(err)
 	}

 	visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
-	mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
-
-	createImg := func(width, height int) ([]byte, error) {
-		img := image.NewRGBA(image.Rect(0, 0, width, height))
-		var buf bytes.Buffer
-
-		if err := png.Encode(&buf, img); err != nil {
-			return nil, err
-		}
-
-		return buf.Bytes(), nil
-	}
-
-	imgBuf, err := createImg(5, 5)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	imgBuf2, err := createImg(6, 6)
-	if err != nil {
-		t.Fatal(err)
-	}

 	cases := []struct {
 		name string
@@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) {
 				images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
 			},
 		},
-		{
-			name:  "messages with mllama (no images)",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
-			},
-			expect: expect{
-				prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
-			},
-		},
-		{
-			name:  "messages with mllama single prompt",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "messages with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
-			},
-			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "multiple messages with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf, imgBuf2},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "earlier image with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
-				{Role: "assistant", Content: "There are four hotdogs."},
-				{Role: "user", Content: "Which ones have mustard?"},
-			},
-			expect: expect{
-				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
-				images:        [][]byte{imgBuf},
-				aspectRatioID: 1,
-			},
-		},
-		{
-			name:  "too many images with mllama",
-			model: mllamaModel,
-			limit: 2048,
-			msgs: []api.Message{
-				{Role: "user", Content: "You're a test, Harry!"},
-				{Role: "assistant", Content: "I-I'm a what?"},
-				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
-			},
-			expect: expect{
-				error: errTooManyImages,
-			},
-		},
 	}

 	for _, tt := range cases {
@@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) {
 					if !bytes.Equal(images[i].Data, tt.images[i]) {
 						t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
 					}
-				} else {
-					if images[i].AspectRatioID != tt.aspectRatioID {
-						t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
-					}
 				}
 			}
 		})

...
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"cmp"
 	"context"
-	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -35,7 +34,6 @@ import (
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
-	"github.com/ollama/ollama/model/models/mllama"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
@@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}

+	if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
+		return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
+	}
+
 	if err := model.CheckCapabilities(caps...); err != nil {
 		return nil, nil, nil, fmt.Errorf("%s %w", name, err)
 	}
@@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	isMllama := checkMllamaModelFamily(m)
-	if isMllama && len(req.Images) > 1 {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
+	if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
 		return
 	}

 	images := make([]llm.ImageData, len(req.Images))
 	for i := range req.Images {
-		if isMllama && len(m.ProjectorPaths) > 0 {
-			data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			ar, ok := opts["aspectRatioIndex"].(int)
-			if !ok {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			buf := new(bytes.Buffer)
-			err = binary.Write(buf, binary.LittleEndian, data)
-			if err != nil {
-				c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
-				return
-			}
-
-			images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
-		} else {
-			images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
-		}
+		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 	}

 	prompt := req.Prompt
@@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		for _, i := range images {
 			imgPrompt := ""
-			if isMllama {
-				imgPrompt = "<|image|>"
-			}
 			msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
 		}

...
@@ -8,6 +8,7 @@ import (
 	"os"
 	"reflect"
 	"runtime"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				continue
 			}
 			numParallel := int(envconfig.NumParallel())
-			// TODO (jmorganca): mllama doesn't support parallel yet
-			// see https://github.com/ollama/ollama/issues/4165
-			if checkMllamaModelFamily(pending.model) && numParallel != 1 {
+			// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
+			// ref: https://github.com/ollama/ollama/issues/4165
+			if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
 				numParallel = 1
-				slog.Warn("mllama doesn't support parallel requests yet")
+				slog.Warn("mllama does not currently support parallel requests")
 			}

 			for {

...