package mllama

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/model/imageproc"
)

// getSupportedAspectRatios lists every tile arrangement whose total tile
// count does not exceed maxTiles. Each arrangement is an image.Point with the
// number of tile columns in X and the number of tile rows in Y.
//
// Arrangements are ordered by ascending X and, for equal X, by ascending Y.
// Preprocess reports an arrangement by its position in this slice, so the
// order is part of the contract.
//
// A maxTiles below 1 yields an empty, non-nil slice.
func getSupportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}

	for w := 1; w <= maxTiles; w++ {
		// w*h only grows with h, so the first h that exceeds the tile budget
		// ends the scan for this w.
		for h := 1; w*h <= maxTiles; h++ {
			ratios = append(ratios, image.Point{X: w, Y: h})
		}
	}

	return ratios
}

// clip limits a to the range [a_min, a_max].
//
// The lower bound is tested first: any a below a_min yields a_min, and only
// then is a compared against a_max. When the bounds are inverted
// (a_min > a_max) the result therefore depends on which side of a_min the
// value falls.
func clip(a, a_min, a_max int) int {
	switch {
	case a < a_min:
		return a_min
	case a > a_max:
		return a_max
	default:
		return a
	}
}

func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
42
	possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
	possibleCanvasSizes := []image.Point{}
	for _, pta := range possibleTileArrangements {
		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
	}

	scales := []float64{}

	for _, pcs := range possibleCanvasSizes {
		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
		scaleWidth := float64(pcs.X) / float64(imageSize.X)

		if scaleWidth > scaleHeight {
			scales = append(scales, scaleHeight)
		} else {
			scales = append(scales, scaleWidth)
		}
	}

	var minUpscale float64
	var maxDownscale float64
	var upscale bool

	for _, s := range scales {
		if s > 1.0 {
			upscale = true
			if minUpscale == 0 {
				minUpscale = s
			} else {
				minUpscale = math.Min(minUpscale, s)
			}
		} else {
			maxDownscale = math.Max(maxDownscale, s)
		}
	}

	selectedScale := maxDownscale
	if upscale {
		selectedScale = minUpscale
	}

	var selectedCanvas image.Point
	for n, pcs := range possibleCanvasSizes {
		if scales[n] == selectedScale {
			// choose the smallest possible canvas
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

// getImageSizeFitToCanvas returns the size an image of imageSize should be
// resized to for a canvas of canvasSize.
//
// A target length is computed per axis by clamping the image length with
// clip to the range [tileSize, canvas length]. The axis whose
// target-to-original ratio is smaller takes its full target length; the other
// axis is the original length multiplied by that same ratio, rounded down,
// and capped at its own target. Using one ratio for both axes keeps the
// image's proportions.
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	if scaleWidth < scaleHeight {
		// Width is the limiting axis; derive the height from it.
		h := int(math.Floor(float64(imageSize.Y) * scaleWidth))
		return image.Point{X: targetWidth, Y: min(h, targetHeight)}
	}

	// Height is the limiting axis (or both ratios are equal).
	w := int(math.Floor(float64(imageSize.X) * scaleHeight))
	return image.Point{X: min(w, targetWidth), Y: targetHeight}
}

func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
118
	if format == "png" {
119
		img = imageproc.Composite(img)
120
121
122
123
124
125
126
127
128
	}

	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

129
	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
130
131
}

// padImage copies img into the top-left corner of a new RGBA canvas that is
// outputSize.X*aspectRatio.X pixels wide and outputSize.Y*aspectRatio.Y
// pixels high.
//
// Canvas pixels not covered by img keep the zero value of a freshly
// allocated RGBA image. Any part of img that extends past the canvas is
// clipped by draw.Draw.
func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	// Anchor the destination at the canvas origin and start reading at the
	// image's own Min, so images whose bounds do not start at (0,0) still
	// land in the top-left corner.
	src := img.Bounds()
	draw.Draw(dst, image.Rectangle{Max: src.Size()}, img, src.Min, draw.Over)

	return dst
}

// splitToTiles cuts img into numTilesSize.X columns by numTilesSize.Y rows of
// equally sized tiles and returns them row by row (left to right, then top to
// bottom).
//
// The tile size is the image size divided by the tile count using integer
// division, so any remainder pixels at the right and bottom edges are not
// part of any tile. Tiles are produced with the image's SubImage method: img
// must provide one, otherwise the type assertion panics.
//
// An empty, non-nil slice is returned when either tile count is below 1.
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	if numTilesSize.X <= 0 || numTilesSize.Y <= 0 {
		// Nothing to cut; also avoids dividing by a zero tile count.
		return []image.Image{}
	}

	b := img.Bounds()
	tileWidth := b.Dx() / numTilesSize.X
	tileHeight := b.Dy() / numTilesSize.Y

	// Resolve the SubImage method once instead of once per tile.
	src := img.(interface {
		SubImage(image.Rectangle) image.Image
	})

	images := make([]image.Image, 0, numTilesSize.X*numTilesSize.Y)
	for row := 0; row < numTilesSize.Y; row++ {
		for col := 0; col < numTilesSize.X; col++ {
			// Offset by b.Min so images whose bounds do not start at the
			// origin are tiled over their actual pixel area.
			x0 := b.Min.X + col*tileWidth
			y0 := b.Min.Y + row*tileHeight
			images = append(images, src.SubImage(image.Rect(x0, y0, x0+tileWidth, y0+tileHeight)))
		}
	}

	return images
}

func packImages(img image.Image, aspectRatio image.Point) []float32 {
166
167
168
169
	subImages := splitToTiles(img, aspectRatio)

	var pixelVals []float32

170
171
172
	rescale := true
	channelFirst := true

173
	for _, subImg := range subImages {
174
175
		vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
		pixelVals = append(pixelVals, vals...)
176
177
178
179
180
	}

	return pixelVals
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
182
183
184
	outputSize := image.Point{560, 560}
	maxTiles := 4

185
	img, format, err := image.Decode(imageData)
186
	if err != nil {
187
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
188
189
	}

190
191
	newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
	newImage = padImage(newImage, outputSize, aspectRatio)
192

193
194
195
196
197
198
	data := packImages(newImage, aspectRatio)
	aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1

	opts := map[string]any{
		"aspectRatioIndex": aspectRatioIndex,
	}
199

200
	return data, opts, nil
201
}