OpenDAS / ollama · Commits

Commit d07cf41a
authored Oct 31, 2024 by Michael Yang

refactor kv estimation

parent 8c238e70
Showing 2 changed files with 20 additions and 8 deletions:

  llm/ggml.go    +16 -1
  llm/memory.go   +4 -7
llm/ggml.go

@@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) {
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
@@ -368,9 +368,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
 
 	layers := llm.Tensors().Layers()
 
+	kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV
+
 	switch llm.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
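To see what the new `kv` expression computes, here is a minimal sketch (not part of the commit) that evaluates the fp16 KV-cache formula with hypothetical Llama-3-8B-style parameters; all concrete numbers below are illustrative assumptions, not values read from a model file.

```go
package main

import "fmt"

func main() {
	var (
		context         uint64 = 8192 // n_ctx: full context window (assumed)
		blockCount      uint64 = 32   // n_layer (assumed)
		headsKV         uint64 = 8    // n_head_kv, grouped-query attention (assumed)
		embeddingHeadsK uint64 = 128  // n_embd_head_k (assumed)
		embeddingHeadsV uint64 = 128  // n_embd_head_v (assumed)
	)

	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
	kv := 2 * context * blockCount * (embeddingHeadsK + embeddingHeadsV) * headsKV

	fmt.Printf("fp16 KV cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))
	// prints: fp16 KV cache: 1073741824 bytes (1.00 GiB)
}
```

With these parameters the cache comes out to exactly 1 GiB at full context, which is the per-model constant the rest of the estimator divides up across layers.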
@@ -403,6 +406,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2 * // sizeof(float16)
+					(llm.KV().BlockCount() - uint64(crossAttentionLayers.size)) * // num non-cross attention layers
+					context +
+					4 * // sizeof(float32)
+						uint64(crossAttentionLayers.size) * // num cross attention layers
+						visionTokens *
+						tiles)
+		}
+
 		fullOffload = max(
 			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
 			// vocab graph
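The mllama branch splits the cache in two: self-attention layers store `context` entries at fp16, while cross-attention layers store the fixed `visionTokens * tiles` vision sequence at fp32. A minimal sketch with hypothetical values (the layer counts and head sizes below are assumptions for illustration):

```go
package main

import "fmt"

func main() {
	var (
		context         uint64 = 8192 // n_ctx (assumed)
		blockCount      uint64 = 40   // total transformer layers (assumed)
		crossLayers     uint64 = 8    // cross-attention layers (assumed)
		headsKV         uint64 = 8    // n_head_kv (assumed)
		embeddingHeadsK uint64 = 128  // n_embd_head_k (assumed)
		embeddingHeadsV uint64 = 128  // n_embd_head_v (assumed)
		visionTokens    uint64 = 1601 // from the diff above
		tiles           uint64 = 4    // from the diff above
	)

	kv := headsKV *
		(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
		(2* // sizeof(float16): self-attention layers cache `context` entries
			(blockCount-crossLayers)*
			context +
			4* // sizeof(float32): cross-attention layers cache the vision sequence
				crossLayers*
				visionTokens*
				tiles)

	fmt.Printf("mllama KV cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))
	// prints: mllama KV cache: 1493434368 bytes (1.39 GiB)
}
```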
llm/memory.go

@@ -123,13 +123,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
-
-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
-
-	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
@@ -137,6 +131,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		graphFullOffload = graphPartialOffload
 	}
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
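Since `kv` is now a return value of GraphSize rather than a local computation, the `layerSize += kv / BlockCount()` line has to move below the call: the cache estimate only exists once GraphSize has run. A minimal sketch of how the returned value feeds its two consumers; all numbers are hypothetical, and GQA() is assumed here to be HeadCount divided by HeadCountKV:

```go
package main

import "fmt"

func main() {
	var (
		kv         uint64 = 1 << 30   // 1 GiB fp16 KV cache, from the earlier sketch
		blockCount uint64 = 32        // n_layer (assumed)
		gqa        uint64 = 4         // HeadCount / HeadCountKV (assumed)
		layerSize  uint64 = 100 << 20 // hypothetical blk.0 weight size: 100 MiB
	)

	// Fallback used when GraphSize has no tuned estimate for the architecture.
	graphPartialOffload := gqa * kv / 6

	// KV is proportional to the number of layers: each offloaded layer
	// carries its 1/n_layer share of the cache.
	layerSize += kv / blockCount

	fmt.Printf("per-layer size: %d MiB\n", layerSize>>20)                  // 132 MiB
	fmt.Printf("partial-offload graph: %d MiB\n", graphPartialOffload>>20) // 682 MiB
}
```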