Unverified commit 23125648 authored by Michael Yang, committed by GitHub

chore: update mllama to use ollama engine (#10637)

parent 0478d440
package mllama
import (
"image"
"testing"
"github.com/google/go-cmp/cmp"
)
func TestSupportedAspectRatios(t *testing.T) {
cases := []struct {
p ImageProcessor
want []supportedAspectRatio
}{
{
p: ImageProcessor{maxNumTiles: 1},
want: []supportedAspectRatio{
{1, 1, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 2},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 2, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 3},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 2, 1},
{5, 3, 1},
},
},
{
p: ImageProcessor{maxNumTiles: 4},
want: []supportedAspectRatio{
{1, 1, 1},
{2, 1, 2},
{3, 1, 3},
{4, 1, 4},
{5, 2, 1},
{6, 2, 2},
{7, 3, 1},
{8, 4, 1},
},
},
}
for _, tt := range cases {
actual := tt.p.supportedAspectRatios()
if diff := cmp.Diff(actual, tt.want, cmp.AllowUnexported(supportedAspectRatio{})); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
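// Illustrative sketch, not part of this commit: the cases above imply that
// supportedAspectRatios enumerates every tile grid whose area fits within
// maxNumTiles, ordered by tile-grid width then height, with a 1-based rank.
// The standalone helper and field names below are assumptions for illustration.
type tileGrid struct{ rank, tilesX, tilesY int }

func enumerateTileGrids(maxNumTiles int) []tileGrid {
	var grids []tileGrid
	for x := 1; x <= maxNumTiles; x++ {
		for y := 1; y <= maxNumTiles; y++ {
			if x*y <= maxNumTiles {
				grids = append(grids, tileGrid{rank: len(grids) + 1, tilesX: x, tilesY: y})
			}
		}
	}
	return grids
}

// enumerateTileGrids(4) yields ranks 1..8 in the same order as the maxNumTiles=4
// case above: (1,1) (1,2) (1,3) (1,4) (2,1) (2,2) (3,1) (4,1).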
func TestFitToCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
canvas image.Point
expect image.Point
}{
{
p: ImageProcessor{imageSize: 200},
image: image.Point{400, 400},
canvas: image.Point{640, 480},
expect: image.Point{400, 400},
},
{
p: ImageProcessor{imageSize: 200},
image: image.Point{1024, 768},
canvas: image.Point{640, 480},
expect: image.Point{640, 480},
},
{
p: ImageProcessor{imageSize: 750},
image: image.Point{500, 500},
canvas: image.Point{1000, 1000},
expect: image.Point{750, 750},
},
{
p: ImageProcessor{imageSize: 2000},
image: image.Point{500, 1000},
canvas: image.Point{2000, 2000},
expect: image.Point{1000, 2000},
},
{
p: ImageProcessor{imageSize: 1000},
image: image.Point{4000, 3000},
canvas: image.Point{2000, 1000},
expect: image.Point{1333, 1000},
},
{
p: ImageProcessor{imageSize: 560},
image: image.Point{667, 1000},
canvas: image.Point{1000, 1000},
expect: image.Point{667, 1000},
},
}
for _, tt := range cases {
actual := tt.p.fitToCanvas(tt.image, tt.canvas)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
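// Illustrative sketch, not part of this commit: the cases above are consistent
// with clamping each dimension between the tile size and the canvas, then scaling
// the image by the smaller of the two resulting factors, never past the clamped
// target. The standalone helper below is an assumption for illustration.
func fitToCanvasSketch(tileSize int, img, canvas image.Point) image.Point {
	clamp := func(v, lo, hi int) int { return min(max(v, lo), hi) }

	targetW := clamp(img.X, tileSize, canvas.X)
	targetH := clamp(img.Y, tileSize, canvas.Y)

	scaleW := float64(targetW) / float64(img.X)
	scaleH := float64(targetH) / float64(img.Y)

	if scaleW < scaleH {
		// width is the limiting dimension
		return image.Point{targetW, min(int(float64(img.Y)*scaleW), targetH)}
	}
	return image.Point{min(int(float64(img.X)*scaleH), targetW), targetH}
}

// For example, with imageSize=1000 a 4000x3000 image on a 2000x1000 canvas clamps
// to a 2000x1000 target, the height factor (1/3) is the smaller one, and the result
// is 1333x1000, matching the fifth case above.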
func TestOptimalTiledCanvas(t *testing.T) {
cases := []struct {
p ImageProcessor
image image.Point
expect image.Point
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 1000},
image: image.Point{1024, 768},
expect: image.Point{2000, 1000},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1024, 768},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{800, 600},
expect: image.Point{1120, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{640, 480},
expect: image.Point{1120, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{320, 200},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{1320, 200},
expect: image.Point{1680, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{2000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 200},
expect: image.Point{2240, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{480, 640},
expect: image.Point{560, 1120},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 320},
expect: image.Point{560, 560},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 1320},
expect: image.Point{560, 1680},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 2000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{200, 10000},
expect: image.Point{560, 2240},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
image: image.Point{10000, 10000},
expect: image.Point{1120, 1120},
},
}
for _, tt := range cases {
actual := tt.p.optimalTiledCanvas(tt.image)
if diff := cmp.Diff(actual, tt.expect); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
}
}
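// Illustrative sketch, not part of this commit: the table above is consistent with
// scoring every allowed tile grid by how much the image must be scaled to fit its
// canvas, preferring the smallest upscale, otherwise the mildest downscale, and
// breaking ties by the smallest canvas area. Helper names are assumptions.
func optimalTiledCanvasSketch(maxNumTiles, tileSize int, img image.Point) image.Point {
	var best image.Point
	var bestScale float64
	for x := 1; x <= maxNumTiles; x++ {
		for y := 1; y <= maxNumTiles; y++ {
			if x*y > maxNumTiles {
				continue
			}
			canvas := image.Point{x * tileSize, y * tileSize}
			scale := min(float64(canvas.X)/float64(img.X), float64(canvas.Y)/float64(img.Y))

			better := false
			switch {
			case best == (image.Point{}):
				better = true // first candidate
			case bestScale < 1 && scale >= 1:
				better = true // any upscale beats any downscale
			case bestScale >= 1 && scale < 1:
				// keep the existing upscale
			case scale >= 1:
				// both upscale: prefer the smaller scale, then the smaller canvas
				better = scale < bestScale || (scale == bestScale && canvas.X*canvas.Y < best.X*best.Y)
			default:
				// both downscale: prefer the larger scale, then the smaller canvas
				better = scale > bestScale || (scale == bestScale && canvas.X*canvas.Y < best.X*best.Y)
			}
			if better {
				best, bestScale = canvas, scale
			}
		}
	}
	return best
}

// For a 640x480 image with imageSize=560 this picks the 2x1 grid (1120x560): the
// smallest canvas among those needing the least upscaling, matching the case above.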
func TestSplitToTiles(t *testing.T) {
cases := []struct {
imageMax image.Point
numTiles image.Point
expect []image.Image
}{
{
imageMax: image.Point{1024, 768},
numTiles: image.Point{1, 1},
expect: []image.Image{image.NewRGBA(image.Rect(0, 0, 1024, 768))},
},
{
imageMax: image.Point{1000, 500},
numTiles: image.Point{2, 1},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
},
},
{
imageMax: image.Point{1000, 1000},
numTiles: image.Point{2, 2},
expect: []image.Image{
image.NewRGBA(image.Rect(0, 0, 500, 500)),
image.NewRGBA(image.Rect(500, 0, 1000, 500)),
image.NewRGBA(image.Rect(0, 500, 500, 1000)),
image.NewRGBA(image.Rect(500, 500, 1000, 1000)),
},
},
}
var p ImageProcessor
for _, tt := range cases {
actual := p.splitToTiles(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.numTiles)
if len(actual) != len(tt.expect) {
t.Errorf("incorrect number of images '%d': expect: '%d'", len(actual), len(tt.expect))
}
for i := range actual {
if actual[i].Bounds() != tt.expect[i].Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual[i].Bounds(), tt.expect[i].Bounds())
}
}
}
}
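// Illustrative sketch, not part of this commit: the bounds checked above follow
// from cropping the source into an even numTiles.X by numTiles.Y grid, row by row.
// The standalone helper below assumes an *image.RGBA source so SubImage is available.
func splitToTilesSketch(src *image.RGBA, numTiles image.Point) []image.Image {
	b := src.Bounds()
	tileW, tileH := b.Dx()/numTiles.X, b.Dy()/numTiles.Y

	var tiles []image.Image
	for y := 0; y < numTiles.Y; y++ {
		for x := 0; x < numTiles.X; x++ {
			r := image.Rect(b.Min.X+x*tileW, b.Min.Y+y*tileH, b.Min.X+(x+1)*tileW, b.Min.Y+(y+1)*tileH)
			tiles = append(tiles, src.SubImage(r))
		}
	}
	return tiles
}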
func TestResize(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
expectImage image.Image
expectAspectRatio image.Point
}{
{
p: ImageProcessor{maxNumTiles: 1, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 2, imageSize: 100},
imageMax: image.Point{200, 200},
expectImage: image.NewRGBA(image.Rect(0, 0, 100, 100)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{10, 10},
expectImage: image.NewRGBA(image.Rect(0, 0, 560, 560)),
expectAspectRatio: image.Point{1, 1},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{2560, 1920},
expectImage: image.NewRGBA(image.Rect(0, 0, 1120, 840)),
expectAspectRatio: image.Point{2, 2},
},
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1024, 768},
expectImage: image.NewRGBA(image.Rect(0, 0, 1024, 768)),
expectAspectRatio: image.Point{2, 2},
},
}
for _, tt := range cases {
actualImage, actualAspectRatio := tt.p.resize(image.Rectangle{Max: tt.imageMax})
if actualImage.Bounds() != tt.expectImage.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actualImage.Bounds(), tt.expectImage.Bounds())
}
if actualAspectRatio != tt.expectAspectRatio {
t.Errorf("aspect ratio incorrect: '%#v': expect: '%#v'", actualAspectRatio, tt.expectAspectRatio)
}
}
}
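// Worked example, not part of this commit: for the 2560x1920 case above, the
// mildest-downscale canvas for maxNumTiles=4 and imageSize=560 is the 2x2 grid
// (1120x1120); clamping and scaling within that canvas keeps the 4:3 shape and
// yields 1120x840, which is why the expected aspect ratio is {2, 2} even though
// the resized image does not fill the whole canvas.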
func TestPad(t *testing.T) {
cases := []struct {
p ImageProcessor
imageMax image.Point
aspectRatio image.Point
expect image.Image
}{
{
p: ImageProcessor{maxNumTiles: 4, imageSize: 560},
imageMax: image.Point{1000, 667},
aspectRatio: image.Point{2, 2},
expect: image.NewRGBA(image.Rect(0, 0, 1120, 1120)),
},
}
for _, tt := range cases {
actual := tt.p.pad(image.Rectangle{Max: tt.imageMax}, tt.aspectRatio)
if actual.Bounds() != tt.expect.Bounds() {
t.Errorf("image size incorrect: '%#v': expect: '%#v'", actual.Bounds(), tt.expect.Bounds())
}
}
}
func TestPackImages(t *testing.T) {
cases := []struct {
imageMax image.Point
aspectRatio image.Point
expectVals int
}{
{
imageMax: image.Point{1120, 1120},
aspectRatio: image.Point{2, 2},
expectVals: 2 * 2 * 3 * 560 * 560,
},
{
imageMax: image.Point{560, 560},
aspectRatio: image.Point{1, 1},
expectVals: 1 * 1 * 3 * 560 * 560,
},
{
imageMax: image.Point{1120, 560},
aspectRatio: image.Point{1, 2},
expectVals: 1 * 2 * 3 * 560 * 560,
},
}
for _, tt := range cases {
var p ImageProcessor
actualVals := p.pack(image.NewRGBA(image.Rectangle{Max: tt.imageMax}), tt.aspectRatio)
if len(actualVals) != tt.expectVals {
t.Errorf("packed image size incorrect: '%d': expect: '%d'", len(actualVals), tt.expectVals)
}
}
}
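// Worked example, not part of this commit: the expected lengths above are just
// tilesX * tilesY * channels * tileSize * tileSize float32 values, one per RGB
// channel per pixel per tile; the 2x2 case packs 2*2*3*560*560 = 3,763,200 values.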
func TestPreprocess(t *testing.T) {
cases := []struct {
imageMax image.Point
expectAspectRatioID int
}{
{
imageMax: image.Point{10, 10},
expectAspectRatioID: 1,
},
{
imageMax: image.Point{1024, 768},
expectAspectRatioID: 6,
},
}
p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
for _, tt := range cases {
img, aspectRatio, err := p.ProcessImage(image.NewRGBA(image.Rectangle{Max: tt.imageMax}))
if err != nil {
t.Fatalf("error processing: %q", err)
}
if len(img) == 0 {
t.Errorf("no image data returned")
}
if aspectRatio.rank != tt.expectAspectRatioID {
t.Errorf("aspect ratio incorrect: '%d': expect: '%d'", aspectRatio, tt.expectAspectRatioID)
}
}
}
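// Illustrative usage sketch, not part of this commit, assuming only the
// ProcessImage signature exercised by TestPreprocess above:
func exampleProcessImage(img image.Image) (int, error) {
	p := ImageProcessor{imageSize: 560, maxNumTiles: 4}
	pixels, aspectRatio, err := p.ProcessImage(img)
	if err != nil {
		return 0, err
	}
	// pixels holds the packed per-tile values (the test above only checks that it
	// is non-empty); aspectRatio.rank is the ID the model receives for the image.
	_ = pixels
	return aspectRatio.rank, nil
}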
@@ -5,7 +5,6 @@ import (
"fmt"
"hash/maphash"
"log/slog"
"slices"
"sync"
"time"
@@ -18,8 +17,7 @@ type ImageContext struct {
// mu is required to be held when generating embeddings or accessing the cache
mu sync.Mutex
clip *llama.ClipContext
mllama *llama.MllamaContext
clip *llama.ClipContext
// cache of images to embeddings
images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
var c ImageContext
if arch == "clip" {
c.clip, err = llama.NewClipContext(llamaContext, modelPath)
} else if arch == "mllama" {
c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
} else {
return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
}
@@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
if c.clip != nil {
c.clip.Free()
}
if c.mllama != nil {
c.mllama.Free()
}
}
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
if c == nil {
return nil, nil
}
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
embed, err := c.findImage(hash)
if err != nil {
if c.mllama != nil {
embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
if err != nil {
return nil, err
}
} else if c.clip != nil {
if c.clip != nil {
embed, err = c.clip.NewEmbed(llamaContext, data)
if err != nil {
return nil, err
@@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
return 0
}
// Mllama maps an image to 1 embedding token (llava creates many tokens)
// and doesn't support more than a single image per request.
// The embeddings are large (100 MB), so allocating a big batch can fail
// on some systems
if c.mllama != nil {
return 1
}
return configuredBatchSize
}
func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
if c != nil && c.mllama != nil {
return c.mllama.EmbedSize(llamaContext)
} else {
return llamaContext.Model().NEmbd()
}
}
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
if c == nil || c.mllama == nil {
return false
}
return slices.ContainsFunc(inputs, func(input input) bool {
return input.embed != nil
})
return llamaContext.Model().NEmbd()
}
type imageCache struct {
......
@@ -57,10 +57,6 @@ type Sequence struct {
// input cache being used by this sequence
cache *InputCacheSlot
// does this sequence require cross-attention layers to be processed? - if we have seen
// an image for certain multi-modal models
crossAttention bool
// channel to send responses over
responses chan string
@@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
return nil, fmt.Errorf("invalid image index: %d", n)
}
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
if err != nil {
return nil, err
}
@@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
defer s.mu.Unlock()
var batch *llama.Batch
crossAttention := false
seqIdx := s.nextSeq - 1
for range s.seqs {
@@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
batch = tokenBatch
} else {
batch = embedBatch
seq.crossAttention = s.image.NeedCrossAttention(input)
}
} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
} else if embedding != batch.IsEmbedding() {
s.nextSeq = seqIdx
break
}
@@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
break
}
crossAttention = seq.crossAttention
batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
seq.pendingInputs = append(seq.pendingInputs, input)
seq.iBatch = batch.NumTokens() - 1
@@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
return nil
}
s.lc.SetCrossAttention(crossAttention)
err := s.lc.Decode(batch)
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
}
if crossAttention {
// synchronize state to ensure the cross attention batch is complete.
// needed specifically for multi-GPU systems otherwise an inflight
// task may be incorrectly invalidated causing a crash
s.lc.Synchronize()
}
for i, seq := range s.seqs {
if seq == nil {
continue
@@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
s.seqs[i] = seq
s.cond.Signal()
found = true
......
@@ -3,47 +3,32 @@ package server
import (
"bytes"
"context"
"encoding/binary"
"errors"
"fmt"
"log/slog"
"slices"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/model/models/mllama"
"github.com/ollama/ollama/template"
)
type tokenizeFunc func(context.Context, string) ([]int, error)
var errTooManyImages = errors.New("vision model only supports a single image per message")
// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
// latest message and 2) system messages
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
var system []api.Message
isMllama := checkMllamaModelFamily(m)
var imageNumTokens int
// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
if isMllama {
// Our mllama implementation packs all of the embeddings into a single token
imageNumTokens = 1
} else {
// Clip images are represented as 768 tokens, each an embedding
imageNumTokens = 768
}
// Clip images are represented as 768 tokens, each an embedding
imageNumTokens := 768
n := len(msgs) - 1
// in reverse, find all messages that fit into context window
for i := n; i >= 0; i-- {
if isMllama && len(msgs[i].Images) > 1 {
return "", nil, errTooManyImages
}
// always include the last message
if i == n {
continue
@@ -84,48 +69,17 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
currMsgIdx := n
for cnt, msg := range msgs[currMsgIdx:] {
prefix := ""
imgPrompt := ""
if slices.Contains(m.Config.ModelFamilies, "mllama") && len(msg.Images) > 1 {
return "", nil, errors.New("this model only supports one image while more than one image requested")
}
var prefix string
prompt := msg.Content
for _, i := range msg.Images {
var imgData llm.ImageData
if isMllama {
if len(m.ProjectorPaths) == 0 {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
} else {
data, opts, err := mllama.Preprocess(bytes.NewReader(i))
if err != nil {
return "", nil, err
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
return "", nil, err
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
return "", nil, fmt.Errorf("missing aspect ratio for image")
}
imgData = llm.ImageData{
ID: len(images),
Data: buf.Bytes(),
AspectRatioID: ar,
}
}
imgPrompt = "<|image|>"
} else {
imgData = llm.ImageData{
ID: len(images),
Data: i,
}
imgData := llm.ImageData{
ID: len(images),
Data: i,
}
imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
@@ -137,7 +91,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
images = append(images, imgData)
}
msgs[currMsgIdx+cnt].Content = prefix + imgPrompt + prompt
msgs[currMsgIdx+cnt].Content = prefix + prompt
}
// truncate any messages that do not fit into the context window
@@ -148,12 +102,3 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
return b.String(), images, nil
}
func checkMllamaModelFamily(m *Model) bool {
for _, arch := range m.Config.ModelFamilies {
if arch == "mllama" {
return true
}
}
return false
}
@@ -2,8 +2,6 @@ package server
import (
"bytes"
"image"
"image/png"
"testing"
"github.com/google/go-cmp/cmp"
@@ -14,10 +12,9 @@ import (
func TestChatPrompt(t *testing.T) {
type expect struct {
prompt string
images [][]byte
aspectRatioID int
error error
prompt string
images [][]byte
error error
}
tmpl, err := template.Parse(`
@@ -28,28 +25,6 @@ func TestChatPrompt(t *testing.T) {
t.Fatal(err)
}
visionModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
mllamaModel := Model{Template: tmpl, ProjectorPaths: []string{"vision"}, Config: ConfigV2{ModelFamilies: []string{"mllama"}}}
createImg := func(width, height int) ([]byte, error) {
img := image.NewRGBA(image.Rect(0, 0, width, height))
var buf bytes.Buffer
if err := png.Encode(&buf, img); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
imgBuf, err := createImg(5, 5)
if err != nil {
t.Fatal(err)
}
imgBuf2, err := createImg(6, 6)
if err != nil {
t.Fatal(err)
}
cases := []struct {
name string
@@ -227,90 +202,6 @@ func TestChatPrompt(t *testing.T) {
images: [][]byte{[]byte("one hotdog"), []byte("two hotdogs")},
},
},
{
name: "messages with mllama (no images)",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
},
},
{
name: "messages with mllama single prompt",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "[img-0]<|image|>How many hotdogs are in this image? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "messages with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "multiple messages with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{imgBuf}},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
},
expect: expect{
prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf, imgBuf2},
aspectRatioID: 1,
},
},
{
name: "earlier image with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
{Role: "assistant", Content: "There are four hotdogs."},
{Role: "user", Content: "Which ones have mustard?"},
},
expect: expect{
prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
},
{
name: "too many images with mllama",
model: mllamaModel,
limit: 2048,
msgs: []api.Message{
{Role: "user", Content: "You're a test, Harry!"},
{Role: "assistant", Content: "I-I'm a what?"},
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf, imgBuf}},
},
expect: expect{
error: errTooManyImages,
},
},
}
for _, tt := range cases {
@@ -341,10 +232,6 @@ func TestChatPrompt(t *testing.T) {
if !bytes.Equal(images[i].Data, tt.images[i]) {
t.Errorf("expected %q, got %q", tt.images[i], images[i].Data)
}
} else {
if images[i].AspectRatioID != tt.aspectRatioID {
t.Errorf("expected aspect ratio %d, got %d", tt.aspectRatioID, images[i].AspectRatioID)
}
}
}
})
......
@@ -4,7 +4,6 @@ import (
"bytes"
"cmp"
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
@@ -35,7 +34,6 @@ import (
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/model/models/mllama"
"github.com/ollama/ollama/openai"
"github.com/ollama/ollama/server/internal/client/ollama"
"github.com/ollama/ollama/server/internal/registry"
@@ -100,6 +98,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
return nil, nil, nil, err
}
if slices.Contains(model.Config.ModelFamilies, "mllama") && len(model.ProjectorPaths) > 0 {
return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
}
if err := model.CheckCapabilities(caps...); err != nil {
return nil, nil, nil, fmt.Errorf("%s %w", name, err)
}
@@ -206,38 +208,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
isMllama := checkMllamaModelFamily(m)
if isMllama && len(req.Images) > 1 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image: more than one image sent"})
if slices.Contains(m.Config.ModelFamilies, "mllama") && len(req.Images) > 1 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "this model only supports one image while more than one image requested"})
return
}
images := make([]llm.ImageData, len(req.Images))
for i := range req.Images {
if isMllama && len(m.ProjectorPaths) > 0 {
data, opts, err := mllama.Preprocess(bytes.NewReader(req.Images[i]))
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
ar, ok := opts["aspectRatioIndex"].(int)
if !ok {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
buf := new(bytes.Buffer)
err = binary.Write(buf, binary.LittleEndian, data)
if err != nil {
c.AbortWithStatusJSON(http.StatusInternalServerError, gin.H{"error": "error processing image"})
return
}
images[i] = llm.ImageData{ID: i, Data: buf.Bytes(), AspectRatioID: ar}
} else {
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
}
prompt := req.Prompt
@@ -269,9 +247,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
for _, i := range images {
imgPrompt := ""
if isMllama {
imgPrompt = "<|image|>"
}
msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]"+imgPrompt, i.ID)})
}
......
@@ -8,6 +8,7 @@ import (
"os"
"reflect"
"runtime"
"slices"
"sort"
"strconv"
"strings"
@@ -132,11 +133,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
continue
}
numParallel := int(envconfig.NumParallel())
// TODO (jmorganca): mllama doesn't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if checkMllamaModelFamily(pending.model) && numParallel != 1 {
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama doesn't support parallel requests yet")
slog.Warn("mllama does not currently support parallel requests")
}
for {
......