Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
12e923e1
Commit
12e923e1
authored
Apr 02, 2024
by
Michael Yang
Browse files
update graph size estimate
parent
cd135317
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
4 deletions
+52
-4
llm/ggml.go
llm/ggml.go
+47
-0
llm/server.go
llm/server.go
+5
-4
No files found.
llm/ggml.go
View file @
12e923e1
...
@@ -303,3 +303,50 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
...
@@ -303,3 +303,50 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
model
:
model
,
model
:
model
,
},
offset
,
nil
},
offset
,
nil
}
}
func
(
llm
GGML
)
GraphSize
(
context
,
batch
int
)
(
int64
,
bool
)
{
embeddingLength
:=
llm
.
KV
()
.
EmbeddingLength
()
headCount
:=
llm
.
KV
()
.
HeadCount
()
headCountKV
:=
llm
.
KV
()
.
HeadCountKV
()
vocabLength
:=
len
(
llm
.
KV
()[
"tokenizer.ggml.tokens"
]
.
([]
any
))
var
attnQKVWeight1
uint64
=
0
for
_
,
t
:=
range
llm
.
Tensors
()
{
if
strings
.
HasSuffix
(
t
.
Name
,
".attn_qkv.weight"
)
&&
len
(
t
.
Shape
)
>=
2
{
attnQKVWeight1
=
t
.
Shape
[
1
]
break
}
}
var
ffnGate1
uint64
=
0
for
_
,
t
:=
range
llm
.
Tensors
()
{
if
strings
.
Index
(
t
.
Name
,
".ffn_gate"
)
>
0
&&
len
(
t
.
Shape
)
>=
2
{
ffnGate1
=
t
.
Shape
[
1
]
break
}
}
switch
llm
.
KV
()
.
Architecture
()
{
case
"gemma"
:
return
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
true
case
"phi2"
:
return
max
(
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
4
*
int64
(
batch
)
*
int64
(
1
+
4
*
embeddingLength
+
uint64
(
context
)
+
attnQKVWeight1
+
uint64
(
context
)
*
headCount
),
),
true
case
"qwen2"
:
return
max
(
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
4
*
int64
(
batch
)
*
int64
(
1
+
2
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
),
),
true
case
"llama"
:
if
ffnGate1
>
0
{
// moe
return
4
*
int64
(
batch
)
*
int64
(
2
+
3
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
+
2
*
headCountKV
+
ffnGate1
),
true
}
return
4
*
int64
(
batch
)
*
int64
(
1
+
4
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
),
true
}
return
0
,
false
}
llm/server.go
View file @
12e923e1
...
@@ -79,10 +79,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
...
@@ -79,10 +79,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv
:=
2
*
2
*
int64
(
opts
.
NumCtx
)
*
int64
(
ggml
.
KV
()
.
BlockCount
())
*
int64
(
ggml
.
KV
()
.
EmbeddingLength
())
/
int64
(
ggml
.
KV
()
.
HeadCount
())
*
int64
(
ggml
.
KV
()
.
HeadCountKV
())
kv
:=
2
*
2
*
int64
(
opts
.
NumCtx
)
*
int64
(
ggml
.
KV
()
.
BlockCount
())
*
int64
(
ggml
.
KV
()
.
EmbeddingLength
())
/
int64
(
ggml
.
KV
()
.
HeadCount
())
*
int64
(
ggml
.
KV
()
.
HeadCountKV
())
// this amount is the overhead + tensors in memory
graph
,
ok
:=
ggml
.
GraphSize
(
opts
.
NumCtx
,
min
(
opts
.
NumCtx
,
opts
.
NumBatch
))
// TODO: get this from the llama.cpp's graph calculations instead of
if
!
ok
{
// estimating it's 1/6 * kv_cache_size * num_gqa
graph
=
int64
(
ggml
.
KV
()
.
GQA
())
*
kv
/
6
graph
:=
int64
(
ggml
.
KV
()
.
GQA
())
*
kv
/
6
}
usedMemory
+=
graph
usedMemory
+=
graph
if
(
usedMemory
>
availableMemory
||
slices
.
Contains
(
cpuOnlyFamilies
,
ggml
.
KV
()
.
Architecture
()))
&&
info
.
Library
!=
"metal"
{
if
(
usedMemory
>
availableMemory
||
slices
.
Contains
(
cpuOnlyFamilies
,
ggml
.
KV
()
.
Architecture
()))
&&
info
.
Library
!=
"metal"
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment