Commit f247a623 (unverified)
Authored Nov 04, 2024 by Michael Yang; committed by GitHub on Nov 04, 2024
Parents: 44bd9e59, d07cf41a

Merge pull request #7456 from ollama/mxyng/llama3.2-vision-mem

update llama3.2 vision memory estimation
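In short, reading from the diff below: GraphSize gains a third return value, kv, so the KV-cache estimate now lives alongside the per-architecture graph-size formulas, and a new mllama case sizes the cache for Llama 3.2 Vision, whose cross-attention layers cache keys and values over a fixed set of vision tokens rather than the text context. EstimateGPULayers in llm/memory.go correspondingly drops its own fp16 KV formula and uses whatever GraphSize returns.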
Showing 2 changed files with 44 additions and 8 deletions (+44 -8):

  llm/ggml.go    +40 -1
  llm/memory.go  +4  -7
llm/ggml.go

@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@@ -368,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
 
 	layers := llm.Tensors().Layers()
 
+	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
@@ -400,6 +403,42 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 			)
 		}
+	case "mllama":
+		var visionTokens, tiles uint64 = 1601, 4
+
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2* // sizeof(float16)
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					context +
+					4* // sizeof(float32)
+						uint64(crossAttentionLayers.size)* // num cross attention layers
+						visionTokens*
+						tiles)
+		}
+
+		fullOffload = max(
+			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
+			// vocab graph
+			4*batch*(embedding+vocab),
+		)
+
+		var ropeFreqsCount uint64
+		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
+				ropeFreqsCount = ropeFreqsWeights.parameters()
+			}
+		}
+
+		partialOffload = max(
+			4*(batch*
+				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
+				ropeFreqsCount+
+				embeddingHeadsK*context*headsKV),
+			// vocab graph
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
 	case "gemma", "gemma2":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
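To make the mllama arithmetic concrete, here is a minimal standalone sketch. It is not ollama's API: the function name, parameter list, and the example shapes are illustrative assumptions. Self-attention layers cache K/V as float16 over the text context, while cross-attention layers cache K/V as float32 over a fixed visionTokens * tiles = 1601 * 4 positions:

package main

import "fmt"

// mllamaKVBytes mirrors the KV-cache estimate from the mllama case above.
// Hypothetical helper: the name and parameters are illustrative only.
func mllamaKVBytes(context, blockCount, crossLayers, headsKV, embeddingHeadsK, embeddingHeadsV uint64) uint64 {
	const visionTokens, tiles uint64 = 1601, 4
	return headsKV * (embeddingHeadsK + embeddingHeadsV) * // one slot for K, one for V
		(2*(blockCount-crossLayers)*context + // sizeof(float16) * non-cross layers * text positions
			4*crossLayers*visionTokens*tiles) // sizeof(float32) * cross layers * vision positions
}

func main() {
	// Assumed Llama 3.2 11B Vision-like shapes: 40 blocks, 8 of them
	// cross-attention, 8 KV heads, 128-dim K and V heads, 8192-token context.
	bytes := mllamaKVBytes(8192, 40, 8, 8, 128, 128)
	fmt.Printf("estimated KV cache: %.2f GiB\n", float64(bytes)/(1<<30))
}

With these assumed shapes the sketch prints roughly 1.39 GiB. Note that the cross-attention share is fixed by the vision token count, so unlike the text KV cache it does not shrink with a smaller NumCtx.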
llm/memory.go

@@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
-
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
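The removed comment documented the default formula that now lives in GraphSize: fp16 K and V means 2 bytes per element, times n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv. A runnable sketch of that default plus the per-layer split (all shapes here are assumptions, not read from a real GGUF):

package main

import "fmt"

func main() {
	// Illustrative 8B-class text-model shapes (assumed values).
	var (
		numCtx          uint64 = 8192
		blockCount      uint64 = 32
		headsKV         uint64 = 8
		embeddingHeadsK uint64 = 128
		embeddingHeadsV uint64 = 128
	)

	// Default KV estimate, as computed inside GraphSize after this commit:
	// sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv.
	kv := 2 * numCtx * blockCount * (embeddingHeadsK + embeddingHeadsV) * headsKV

	// memory.go spreads the cache evenly across layers ("KV is proportional
	// to the number of layers"), so each offloaded layer is charged this much.
	perLayer := kv / blockCount

	fmt.Printf("kv total: %.2f GiB, per layer: %d MiB\n", float64(kv)/(1<<30), perLayer/(1<<20))
}

For these shapes that is exactly 1 GiB in total, 32 MiB per layer. The reordering in the second hunk matters because layerSize can only be bumped once GraphSize has produced kv; for plain text models the estimate itself is unchanged.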