process_image.go 5.86 KB
Newer Older
Michael Yang's avatar
Michael Yang committed
1
2
3
4
5
6
7
8
9
package mllama

import (
	"fmt"
	"image"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/model/imageproc"
)

14
15
16
17
18
19
20
21
22
23
24
25
// supportedAspectRatio describes one permitted tile layout: a grid that is
// width tiles across and height tiles down, tagged with its rank in the list
// of supported layouts.
type supportedAspectRatio struct {
	rank   int
	width  int
	height int
}

// Point returns the layout as an image.Point with X set to the number of tile
// columns and Y set to the number of tile rows.
func (a supportedAspectRatio) Point() image.Point {
	return image.Pt(a.width, a.height)
}

// numTiles returns the total number of tiles in the layout.
func (a supportedAspectRatio) numTiles() int {
	tiles := a.width * a.height
	return tiles
}

Michael Yang's avatar
Michael Yang committed
26
27
// ImageProcessor holds the settings used to turn an image into tiled,
// normalized pixel data.
type ImageProcessor struct {
	// imageSize is the edge length, in pixels, of a single tile.
	imageSize int
	// numChannels is the channel count taken from the model configuration.
	numChannels int
	// maxNumTiles is the largest number of tiles a canvas may contain.
	maxNumTiles int

	// mean and std are the per-channel (R, G, B) normalization constants
	// applied to each pixel as (value - mean) / std.
	mean [3]float32
	std  [3]float32
}

32
func newImageProcessor(c fs.Config) ImageProcessor {
Michael Yang's avatar
Michael Yang committed
33
34
35
36
	return ImageProcessor{
		imageSize:   int(c.Uint("vision.image_size")),
		numChannels: int(c.Uint("vision.num_channels")),
		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
37
38
39

		mean: imageproc.ClipDefaultMean,
		std:  imageproc.ClipDefaultSTD,
Michael Yang's avatar
Michael Yang committed
40
41
42
	}
}

43
44
45
46
func (p ImageProcessor) supportedAspectRatios() (ratios []supportedAspectRatio) {
	for w := 1; w <= p.maxNumTiles; w++ {
		for h := 1; h <= p.maxNumTiles/w; h++ {
			ratios = append(ratios, supportedAspectRatio{len(ratios) + 1, w, h})
Michael Yang's avatar
Michael Yang committed
47
48
49
50
51
		}
	}
	return ratios
}

52
53
54
func (p ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point) image.Point {
	tw := min(max(imageSize.X, p.imageSize), canvasSize.X)
	th := min(max(imageSize.Y, p.imageSize), canvasSize.Y)
Michael Yang's avatar
Michael Yang committed
55

56
57
58
59
	r := math.Min(
		float64(tw)/float64(imageSize.X),
		float64(th)/float64(imageSize.Y),
	)
Michael Yang's avatar
Michael Yang committed
60

61
62
	w := min(int(math.Floor(float64(imageSize.X)*r)), tw)
	h := min(int(math.Floor(float64(imageSize.Y)*r)), th)
Michael Yang's avatar
Michael Yang committed
63
64
65
66

	return image.Point{w, h}
}

67
68
69
70
71
func (p ImageProcessor) optimalTiledCanvas(imageSize image.Point) image.Point {
	possibleTileArrangements := p.supportedAspectRatios()
	possibleCanvasSizes := make([]image.Point, len(possibleTileArrangements))
	for i, pta := range possibleTileArrangements {
		possibleCanvasSizes[i] = image.Point{pta.width * p.imageSize, pta.height * p.imageSize}
Michael Yang's avatar
Michael Yang committed
72
73
	}

74
75
76
77
78
79
	scales := make([]float64, len(possibleCanvasSizes))
	for i, pcs := range possibleCanvasSizes {
		scales[i] = min(
			float64(pcs.Y)/float64(imageSize.Y),
			float64(pcs.X)/float64(imageSize.X),
		)
Michael Yang's avatar
Michael Yang committed
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
	}

	var minUpscale float64
	var maxDownscale float64
	var upscale bool

	for _, s := range scales {
		if s > 1.0 {
			upscale = true
			if minUpscale == 0 {
				minUpscale = s
			} else {
				minUpscale = math.Min(minUpscale, s)
			}
		} else {
			maxDownscale = math.Max(maxDownscale, s)
		}
	}

	selectedScale := maxDownscale
	if upscale {
		selectedScale = minUpscale
	}

	var selectedCanvas image.Point
	for n, pcs := range possibleCanvasSizes {
		if scales[n] == selectedScale {
			// choose the smallest possible canvas
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

118
func (p ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
Michael Yang's avatar
Michael Yang committed
119
120
121
122
123
124
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

125
	images := make([]image.Image, 0, numTilesSize.Y*numTilesSize.X)
Michael Yang's avatar
Michael Yang committed
126
127
128
129

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
130
			if subImg, ok := img.(interface {
Michael Yang's avatar
Michael Yang committed
131
				SubImage(image.Rectangle) image.Image
132
133
134
135
136
137
138
139
140
			}); ok {
				images = append(images, subImg.SubImage(rect))
			} else {
				// Handle the case where img does not implement SubImage
				// This is a fallback and may not be efficient
				newImg := image.NewRGBA(rect)
				draw.Draw(newImg, rect, img, rect.Min, draw.Src)
				images = append(images, newImg)
			}
Michael Yang's avatar
Michael Yang committed
141
142
143
144
145
146
		}
	}

	return images
}

147
func (p ImageProcessor) resize(img image.Image) (image.Image, image.Point) {
Michael Yang's avatar
Michael Yang committed
148
149
	b := img.Bounds()

150
151
152
	canvasSize := p.optimalTiledCanvas(b.Max)
	aspectRatio := image.Point{canvasSize.X / p.imageSize, canvasSize.Y / p.imageSize}
	newSize := p.fitToCanvas(b.Max, canvasSize)
Michael Yang's avatar
Michael Yang committed
153
154
155
156
157
158
159
160
161
162
163
164
165

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

	// scaling choices:
	//   NearestNeighbor	fast, blocky output
	//   ApproxBiLinear	fast, medium quality
	//   BiLinear		slow, high quality
	//   CatmullRom		very slow, very high quality
	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)

	return dst, aspectRatio
}

166
func (p ImageProcessor) pad(img image.Image, aspectRatio image.Point) image.Image {
Michael Yang's avatar
Michael Yang committed
167
	paddedSize := image.Point{
168
169
		X: p.imageSize * aspectRatio.X,
		Y: p.imageSize * aspectRatio.Y,
Michael Yang's avatar
Michael Yang committed
170
171
172
173
174
175
176
177
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)

	return dst
}

178
func (p ImageProcessor) pack(img image.Image, aspectRatio image.Point) []float32 {
Michael Yang's avatar
Michael Yang committed
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
	subImages := p.splitToTiles(img, aspectRatio)

	var pixelVals []float32

	for _, subImg := range subImages {
		bounds := subImg.Bounds()
		var rVals, gVals, bVals []float32
		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
			for x := bounds.Min.X; x < bounds.Max.X; x++ {
				c := subImg.At(x, y)
				r, g, b, _ := c.RGBA()
				rVal := float32(r>>8) / 255.0
				gVal := float32(g>>8) / 255.0
				bVal := float32(b>>8) / 255.0

194
195
196
				rVal = (rVal - p.mean[0]) / p.std[0]
				gVal = (gVal - p.mean[1]) / p.std[1]
				bVal = (bVal - p.mean[2]) / p.std[2]
Michael Yang's avatar
Michael Yang committed
197
198
199
200
201
202
203
204
205
206
207
208
209
210

				rVals = append(rVals, rVal)
				gVals = append(gVals, gVal)
				bVals = append(bVals, bVal)
			}
		}
		pixelVals = append(pixelVals, rVals...)
		pixelVals = append(pixelVals, gVals...)
		pixelVals = append(pixelVals, bVals...)
	}

	return pixelVals
}

211
212
213
214
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, supportedAspectRatio, error) {
	newImage, newImageRatio := p.resize(img)
	newImage = p.pad(newImage, newImageRatio)
	pixelValues := p.pack(newImage, newImageRatio)
Michael Yang's avatar
Michael Yang committed
215

216
217
218
219
	supportedAspectRatios := p.supportedAspectRatios()
	aspectRatioID := slices.IndexFunc(supportedAspectRatios, func(i supportedAspectRatio) bool {
		return i.width == newImageRatio.X && i.height == newImageRatio.Y
	})
Michael Yang's avatar
Michael Yang committed
220

221
	return pixelValues, supportedAspectRatios[aspectRatioID], nil
Michael Yang's avatar
Michael Yang committed
222
}