Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
e01e535c
Unverified
Commit
e01e535c
authored
Jun 20, 2024
by
Michael Yang
Committed by
GitHub
Jun 20, 2024
Browse files
Merge pull request #5192 from ollama/mxyng/kv
handle asymmetric embedding KVs
parents
0195d6a2
8e0641a9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
35 additions
and
9 deletions
+35
-9
llm/ggml.go
llm/ggml.go
+33
-7
llm/memory.go
llm/memory.go
+2
-2
No files found.
llm/ggml.go
View file @
e01e535c
...
...
@@ -69,6 +69,30 @@ func (kv KV) HeadCountKV() uint64 {
return
1
}
func
(
kv
KV
)
EmbeddingHeadCount
()
uint64
{
if
heads
:=
kv
.
HeadCount
();
heads
>
0
{
return
kv
.
EmbeddingLength
()
/
kv
.
HeadCount
()
}
return
0
}
func
(
kv
KV
)
EmbeddingHeadCountK
()
uint64
{
if
k
:=
kv
.
u64
(
fmt
.
Sprintf
(
"%s.attention.key_length"
,
kv
.
Architecture
()));
k
>
0
{
return
k
}
return
kv
.
EmbeddingHeadCount
()
}
func
(
kv
KV
)
EmbeddingHeadCountV
()
uint64
{
if
v
:=
kv
.
u64
(
fmt
.
Sprintf
(
"%s.attention.value_length"
,
kv
.
Architecture
()));
v
>
0
{
return
v
}
return
kv
.
EmbeddingHeadCount
()
}
func
(
kv
KV
)
GQA
()
uint64
{
return
kv
.
HeadCount
()
/
kv
.
HeadCountKV
()
}
...
...
@@ -299,6 +323,9 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
headsKV
:=
llm
.
KV
()
.
HeadCountKV
()
vocab
:=
uint64
(
len
(
llm
.
KV
()[
"tokenizer.ggml.tokens"
]
.
([]
any
)))
embeddingHeads
:=
llm
.
KV
()
.
EmbeddingHeadCount
()
embeddingHeadsK
:=
llm
.
KV
()
.
EmbeddingHeadCountK
()
layers
:=
llm
.
Tensors
()
.
Layers
()
switch
llm
.
KV
()
.
Architecture
()
{
...
...
@@ -308,7 +335,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
partialOffload
=
4
*
batch
*
embedding
partialOffload
+=
max
(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4
*
batch
*
(
1
+
embedding
+
max
(
context
,
embedding
))
+
embedding
*
embedding
*
9
/
16
+
4
*
context
*
(
batch
*
heads
+
embedding
/
h
eads
*
headsKV
),
4
*
batch
*
(
1
+
embedding
+
max
(
context
,
embedding
))
+
embedding
*
embedding
*
9
/
16
+
4
*
context
*
(
batch
*
heads
+
embedding
H
eads
*
headsKV
),
4
*
batch
*
(
embedding
+
vocab
)
+
embedding
*
vocab
*
105
/
128
,
)
...
...
@@ -316,15 +343,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
// mixtral 8x22b
ff
:=
uint64
(
llm
.
KV
()[
"llama.feed_forward_length"
]
.
(
uint32
))
partialOffload
=
max
(
3
*
ffnGateExpsWeight
.
Size
()
+
4
*
batch
*
(
2
*
ff
+
headsKV
+
embedding
+
context
+
embedding
/
h
eads
*
headsKV
),
4
*
(
context
*
batch
*
heads
+
context
*
embedding
/
h
eads
*
headsKV
+
batch
*
1024
+
embedding
/
h
eads
*
headsKV
*
batch
),
3
*
ffnGateExpsWeight
.
Size
()
+
4
*
batch
*
(
2
*
ff
+
headsKV
+
embedding
+
context
+
embedding
H
eads
*
headsKV
),
4
*
(
context
*
batch
*
heads
+
context
*
embedding
H
eads
*
headsKV
+
batch
*
1024
+
embedding
H
eads
*
headsKV
*
batch
),
)
}
else
if
ffnGateWeight
,
ok
:=
layers
[
"blk.0"
][
"ffn_gate.0.weight"
];
ok
{
// mixtral 8x7b
ffnGateWeight1
:=
ffnGateWeight
.
Shape
[
1
]
fullOffload
=
4
*
batch
*
(
2
+
3
*
embedding
+
context
*
(
1
+
heads
)
+
2
*
headsKV
+
ffnGateWeight1
)
partialOffload
=
max
(
4
*
batch
*
(
3
+
embedding
/
h
eads
*
headsKV
+
embedding
+
context
*
(
1
+
heads
)
+
ffnGateWeight1
)
+
(
embedding
*
embedding
+
3
*
embedding
*
headsKV
*
ffnGateWeight1
)
*
9
/
16
,
4
*
batch
*
(
3
+
embedding
H
eads
*
headsKV
+
embedding
+
context
*
(
1
+
heads
)
+
ffnGateWeight1
)
+
(
embedding
*
embedding
+
3
*
embedding
*
headsKV
*
ffnGateWeight1
)
*
9
/
16
,
4
*
batch
*
(
1
+
2
*
embedding
+
context
*
(
1
+
heads
))
+
embedding
*
(
6
*
context
*
headsKV
/
heads
+
embedding
*
9
/
16
),
)
}
...
...
@@ -368,15 +395,14 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
fullOffload
,
)
case
"deepseek2"
:
keys
:=
uint64
(
llm
.
KV
()[
"deepseek2.attention.key_length"
]
.
(
uint32
))
fullOffload
=
max
(
4
*
batch
*
(
3
*
embedding
+
vocab
),
4
*
batch
*
(
3
*
embedding
+
2
+
context
*
(
1
+
headsKV
)
+
2
*
keys
*
headsKV
),
4
*
batch
*
(
3
*
embedding
+
2
+
context
*
(
1
+
headsKV
)
+
2
*
embeddingHeadsK
*
headsKV
),
)
partialOffload
=
max
(
4
*
batch
*
(
3
*
embedding
+
vocab
)
+
embedding
*
vocab
*
105
/
128
,
4
*
batch
*
(
2
*
embedding
+
1
+
2
*
keys
*
headsKV
+
context
+
context
*
headsKV
)
+
4
*
keys
*
context
*
headsKV
+
embedding
*
keys
*
headsKV
*
9
/
16
,
4
*
batch
*
(
2
*
embedding
+
1
+
2
*
embeddingHeadsK
*
headsKV
+
context
+
context
*
headsKV
)
+
4
*
embeddingHeadsK
*
context
*
headsKV
+
embedding
*
embeddingHeadsK
*
headsKV
*
9
/
16
,
)
}
...
...
llm/memory.go
View file @
e01e535c
...
...
@@ -115,8 +115,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
slog
.
Warn
(
"model missing blk.0 layer size"
)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
var
kv
uint64
=
2
*
2
*
uint64
(
opts
.
NumCtx
)
*
ggml
.
KV
()
.
BlockCount
()
*
ggml
.
KV
()
.
Embedding
Length
()
/
ggml
.
KV
()
.
HeadCount
(
)
*
ggml
.
KV
()
.
HeadCountKV
()
// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
var
kv
uint64
=
2
*
uint64
(
opts
.
NumCtx
)
*
ggml
.
KV
()
.
BlockCount
()
*
(
ggml
.
KV
()
.
Embedding
HeadCountK
()
+
ggml
.
KV
()
.
Embedding
HeadCount
V
()
)
*
ggml
.
KV
()
.
HeadCountKV
()
// KV is proportional to the number of layers
layerSize
+=
kv
/
ggml
.
KV
()
.
BlockCount
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment