Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
470af8ab
"docs/vscode:/vscode.git/clone" did not exist on "ecd2f176277db4f074e25a2c3646b04b51cec119"
Commit
470af8ab
authored
Apr 17, 2025
by
Michael Yang
Committed by
Michael Yang
Apr 25, 2025
Browse files
connect vision to text
parent
178761ae
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
80 additions
and
4 deletions
+80
-4
model/models/llama4/model.go
model/models/llama4/model.go
+69
-3
model/models/llama4/model_text.go
model/models/llama4/model_text.go
+11
-1
No files found.
model/models/llama4/model.go
View file @
470af8ab
...
...
@@ -3,6 +3,8 @@ package llama4
import
(
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
...
...
@@ -78,7 +80,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
return
nil
,
err
}
ratioW
,
ratioH
:=
int
(
size
.
X
/
m
.
imageSize
)
,
int
(
size
.
Y
/
m
.
imageSize
)
ratioW
,
ratioH
:=
size
.
X
/
m
.
imageSize
,
size
.
Y
/
m
.
imageSize
tilesLocal
=
tilesLocal
.
Reshape
(
ctx
,
size
.
X
/
ratioW
,
ratioW
,
size
.
Y
,
m
.
numChannels
)
.
Permute
(
ctx
,
0
,
2
,
1
,
3
)
.
Contiguous
(
ctx
)
tilesLocal
=
tilesLocal
.
Reshape
(
ctx
,
size
.
X
/
ratioW
*
size
.
Y
/
ratioH
,
ratioH
,
ratioW
,
m
.
numChannels
)
.
Permute
(
ctx
,
0
,
3
,
2
,
1
)
.
Contiguous
(
ctx
)
...
...
@@ -97,11 +99,75 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
visionOutputs
:=
m
.
VisionModel
.
Forward
(
ctx
,
pixelValues
)
visionOutputs
=
visionOutputs
.
Reshape
(
ctx
,
visionOutputs
.
Dim
(
0
),
visionOutputs
.
Dim
(
1
)
*
visionOutputs
.
Dim
(
2
)
*
visionOutputs
.
Dim
(
3
))
return
m
.
Projector
.
Forward
(
ctx
,
visionOutputs
),
nil
projectedOutputs
:=
m
.
Projector
.
Forward
(
ctx
,
visionOutputs
)
return
&
chunks
{
Model
:
m
,
Tensor
:
projectedOutputs
,
aspectRatio
:
image
.
Point
{
ratioW
,
ratioH
}},
nil
}
type
chunks
struct
{
*
Model
ml
.
Tensor
aspectRatio
image
.
Point
dataOnce
sync
.
Once
data
[]
float32
}
type
chunk
struct
{
*
chunks
s
,
n
int
}
func
(
r
*
chunk
)
floats
()
[]
float32
{
r
.
dataOnce
.
Do
(
func
()
{
temp
:=
r
.
Backend
()
.
NewContext
()
defer
temp
.
Close
()
temp
.
Forward
(
r
.
Tensor
)
.
Compute
(
r
.
Tensor
)
r
.
data
=
r
.
Floats
()
})
return
r
.
data
[
r
.
s
*
r
.
Dim
(
0
)
:
(
r
.
s
+
r
.
n
)
*
r
.
Dim
(
0
)]
}
func
(
m
*
Model
)
PostTokenize
(
inputs
[]
input
.
Input
)
([]
input
.
Input
,
error
)
{
return
inputs
,
nil
var
result
[]
input
.
Input
for
_
,
inp
:=
range
inputs
{
if
inp
.
Multimodal
==
nil
{
result
=
append
(
result
,
inp
)
continue
}
t
:=
inp
.
Multimodal
.
(
*
chunks
)
var
imageInputs
[]
input
.
Input
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_start|>
var
offset
int
patchesPerChunk
:=
t
.
Dim
(
1
)
if
t
.
aspectRatio
.
Y
*
t
.
aspectRatio
.
X
>
1
{
patchesPerChunk
=
t
.
Dim
(
1
)
/
(
t
.
aspectRatio
.
X
*
t
.
aspectRatio
.
Y
+
1
)
for
range
t
.
aspectRatio
.
Y
{
for
x
:=
range
t
.
aspectRatio
.
X
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
&
chunk
{
t
,
offset
,
patchesPerChunk
},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
if
x
<
t
.
aspectRatio
.
X
-
1
{
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200084
})
// <|tile_x_separator|>
}
offset
+=
patchesPerChunk
}
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200085
})
// <|tile_y_separator|>
}
}
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200090
})
// <|image|>
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200092
,
Multimodal
:
&
chunk
{
t
,
offset
,
patchesPerChunk
},
MultimodalHash
:
inp
.
MultimodalHash
,
SameBatch
:
patchesPerChunk
})
// <|patch|>
imageInputs
=
append
(
imageInputs
,
slices
.
Repeat
([]
input
.
Input
{{
Token
:
200092
}},
patchesPerChunk
-
1
)
...
)
imageInputs
=
append
(
imageInputs
,
input
.
Input
{
Token
:
200080
})
// <|image_end|>
result
=
append
(
result
,
imageInputs
...
)
}
return
result
,
nil
}
func
(
m
*
Model
)
Forward
(
ctx
ml
.
Context
,
batch
input
.
Batch
)
(
ml
.
Tensor
,
error
)
{
...
...
model/models/llama4/model_text.go
View file @
470af8ab
...
...
@@ -195,7 +195,17 @@ func newTextModel(c fs.Config) *TextModel {
}
func
(
m
*
TextModel
)
Forward
(
ctx
ml
.
Context
,
inputs
,
positions
,
outputs
ml
.
Tensor
,
batch
input
.
Batch
,
cache
kvcache
.
Cache
)
ml
.
Tensor
{
hiddenStates
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputs
)
hiddenStates
:=
m
.
TokenEmbedding
.
Forward
(
ctx
,
inputs
)
.
Duplicate
(
ctx
)
for
_
,
mi
:=
range
batch
.
Multimodal
{
f32s
:=
mi
.
Multimodal
.
(
*
chunk
)
.
floats
()
img
,
err
:=
ctx
.
Input
()
.
FromFloatSlice
(
f32s
,
len
(
f32s
)
/
m
.
hiddenSize
,
m
.
hiddenSize
)
if
err
!=
nil
{
panic
(
err
)
}
ctx
.
Forward
(
img
.
Copy
(
ctx
,
hiddenStates
.
View
(
ctx
,
mi
.
Index
*
hiddenStates
.
Stride
(
1
),
img
.
Dim
(
0
)
*
img
.
Dim
(
1
))))
}
for
i
,
layer
:=
range
m
.
Layers
{
cache
.
SetLayer
(
i
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment