Commit f63e62e5 authored by jmorganca, committed by Michael Yang
Browse files

reduce kernel size, add TODO for loading from config

parent 65b0f329
@@ -90,7 +90,11 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 visionOutputs = visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 patchesPerImage := m.ImageProcessor.imageSize / m.ImageProcessor.patchSize
-kernelSize := patchesPerImage * patchesPerImage / 256
+// TODO (jmorganca): read this from the model config
+// it should instead be math.Sqrt(tokens per image)
+tokensPerSide := 8
+kernelSize := patchesPerImage / tokensPerSide
 visionOutputs = visionOutputs.AvgPool1D(ctx, kernelSize, kernelSize, 0)
 visionOutputs = visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment