process_image.go 1.48 KB
Newer Older
Patrick Devine's avatar
Patrick Devine committed
1
2
3
4
5
package gemma3

import (
	"image"

6
	"github.com/ollama/ollama/fs"
Patrick Devine's avatar
Patrick Devine committed
7
8
9
10
	"github.com/ollama/ollama/model/imageproc"
)

// ImageProcessor holds the vision settings used to turn an image into the
// flat float32 buffer produced by ProcessImage.
type ImageProcessor struct {
	// imageSize is the side length, in pixels, of the square image that
	// ProcessImage resizes its input to.
	imageSize int
	// patchSize is populated from the "vision.patch_size" config value.
	patchSize int
	// numChannels is populated from the "vision.num_channels" config value.
	numChannels int
}

14
func newImageProcessor(c fs.Config) ImageProcessor {
Patrick Devine's avatar
Patrick Devine committed
15
16
	return ImageProcessor{
		imageSize:   int(c.Uint("vision.image_size")),
Michael Yang's avatar
Michael Yang committed
17
		patchSize:   int(c.Uint("vision.patch_size")),
Patrick Devine's avatar
Patrick Devine committed
18
19
20
21
22
		numChannels: int(c.Uint("vision.num_channels")),
	}
}

func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 {
Michael Yang's avatar
Michael Yang committed
23
	var pixelVals, rVals, gVals, bVals []float32
Patrick Devine's avatar
Patrick Devine committed
24
25

	bounds := img.Bounds()
Michael Yang's avatar
Michael Yang committed
26
27
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
Patrick Devine's avatar
Patrick Devine committed
28
29
30
31
32
33
34
35
36
37
			c := img.At(x, y)
			r, g, b, _ := c.RGBA()
			rVal := float32(r>>8) / 255.0
			gVal := float32(g>>8) / 255.0
			bVal := float32(b>>8) / 255.0

			rVal = (rVal - mean[0]) / std[0]
			gVal = (gVal - mean[1]) / std[1]
			bVal = (bVal - mean[2]) / std[2]

Michael Yang's avatar
Michael Yang committed
38
39
40
			rVals = append(rVals, rVal)
			gVals = append(gVals, gVal)
			bVals = append(bVals, bVal)
Patrick Devine's avatar
Patrick Devine committed
41
42
43
		}
	}

Michael Yang's avatar
Michael Yang committed
44
45
46
47
	pixelVals = append(pixelVals, rVals...)
	pixelVals = append(pixelVals, gVals...)
	pixelVals = append(pixelVals, bVals...)

Patrick Devine's avatar
Patrick Devine committed
48
49
50
51
52
53
54
55
56
57
58
	return pixelVals
}

// ProcessImage prepares img for the vision model and returns it as a flat
// float32 buffer.
//
// The image is passed through imageproc.Composite, resized to a square of
// imageSize x imageSize pixels with imageproc.ResizeBilinear, and then packed
// using imageproc.ImageNetStandardMean and imageproc.ImageNetStandardSTD as
// the normalization parameters. The returned error is always nil.
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
	target := image.Point{X: p.imageSize, Y: p.imageSize}
	resized := imageproc.Resize(imageproc.Composite(img), target, imageproc.ResizeBilinear)

	return p.pack(resized, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD), nil
}