imageproc.go 2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
package qwen2vl

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"io"
	"math"

	"github.com/ollama/ollama/model/imageproc"
)

const (
	DefaultFactor    = 28
	DefaultMinPixels = 56 * 56
	DefaultMaxPixels = 14 * 14 * 4 * 1280
)

// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
	// 1. Both dimensions of size are divisible by factor
	// 2. The area of the image is between minPixels and maxPixels
	// 3. The aspect ratio of the image is as close to 1:1 as possible

	if size.Y < factor || size.X < factor {
		panic("image is too small to resize")
	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
		panic("aspect ratio must be less than 200:1")
	}

	f := float64(factor)
	width := float64(size.X)
	height := float64(size.Y)

	xBar := math.Round(width/f) * f
	yBar := math.Round(height/f) * f

	if xBar*yBar > float64(maxPixels) {
		beta := math.Sqrt(height * width / float64(maxPixels))
		xBar = math.Floor(width/beta/f) * f
		yBar = math.Floor(height/beta/f) * f
	} else if xBar*yBar < float64(minPixels) {
		beta := math.Sqrt(float64(minPixels) / (height * width))
		xBar = math.Ceil(width*beta/f) * f
		yBar = math.Ceil(height*beta/f) * f
	}

	return image.Point{int(xBar), int(yBar)}
}

func resizeImage(img image.Image, format string, size image.Point) image.Image {
	if format == "png" {
		img = imageproc.Composite(img)
	}

	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}

func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
	img, format, err := image.Decode(imageData)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
	}

	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
	img = resizeImage(img, format, size)

	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)

	opts := map[string]any{}
	return data, opts, nil
}