Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
a0a15cfd
Unverified
Commit
a0a15cfd
authored
Apr 03, 2024
by
Michael Yang
Committed by
GitHub
Apr 03, 2024
Browse files
Merge pull request #3463 from ollama/mxyng/graph-estimate
update graph size estimate
parents
cd135317
12e923e1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
4 deletions
+52
-4
llm/ggml.go
llm/ggml.go
+47
-0
llm/server.go
llm/server.go
+5
-4
No files found.
llm/ggml.go
View file @
a0a15cfd
...
@@ -303,3 +303,50 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
...
@@ -303,3 +303,50 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
model
:
model
,
model
:
model
,
},
offset
,
nil
},
offset
,
nil
}
}
func
(
llm
GGML
)
GraphSize
(
context
,
batch
int
)
(
int64
,
bool
)
{
embeddingLength
:=
llm
.
KV
()
.
EmbeddingLength
()
headCount
:=
llm
.
KV
()
.
HeadCount
()
headCountKV
:=
llm
.
KV
()
.
HeadCountKV
()
vocabLength
:=
len
(
llm
.
KV
()[
"tokenizer.ggml.tokens"
]
.
([]
any
))
var
attnQKVWeight1
uint64
=
0
for
_
,
t
:=
range
llm
.
Tensors
()
{
if
strings
.
HasSuffix
(
t
.
Name
,
".attn_qkv.weight"
)
&&
len
(
t
.
Shape
)
>=
2
{
attnQKVWeight1
=
t
.
Shape
[
1
]
break
}
}
var
ffnGate1
uint64
=
0
for
_
,
t
:=
range
llm
.
Tensors
()
{
if
strings
.
Index
(
t
.
Name
,
".ffn_gate"
)
>
0
&&
len
(
t
.
Shape
)
>=
2
{
ffnGate1
=
t
.
Shape
[
1
]
break
}
}
switch
llm
.
KV
()
.
Architecture
()
{
case
"gemma"
:
return
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
true
case
"phi2"
:
return
max
(
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
4
*
int64
(
batch
)
*
int64
(
1
+
4
*
embeddingLength
+
uint64
(
context
)
+
attnQKVWeight1
+
uint64
(
context
)
*
headCount
),
),
true
case
"qwen2"
:
return
max
(
4
*
int64
(
batch
)
*
int64
(
embeddingLength
+
uint64
(
vocabLength
)),
4
*
int64
(
batch
)
*
int64
(
1
+
2
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
),
),
true
case
"llama"
:
if
ffnGate1
>
0
{
// moe
return
4
*
int64
(
batch
)
*
int64
(
2
+
3
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
+
2
*
headCountKV
+
ffnGate1
),
true
}
return
4
*
int64
(
batch
)
*
int64
(
1
+
4
*
embeddingLength
+
uint64
(
context
)
+
uint64
(
context
)
*
headCount
),
true
}
return
0
,
false
}
llm/server.go
View file @
a0a15cfd
...
@@ -79,10 +79,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
...
@@ -79,10 +79,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv
:=
2
*
2
*
int64
(
opts
.
NumCtx
)
*
int64
(
ggml
.
KV
()
.
BlockCount
())
*
int64
(
ggml
.
KV
()
.
EmbeddingLength
())
/
int64
(
ggml
.
KV
()
.
HeadCount
())
*
int64
(
ggml
.
KV
()
.
HeadCountKV
())
kv
:=
2
*
2
*
int64
(
opts
.
NumCtx
)
*
int64
(
ggml
.
KV
()
.
BlockCount
())
*
int64
(
ggml
.
KV
()
.
EmbeddingLength
())
/
int64
(
ggml
.
KV
()
.
HeadCount
())
*
int64
(
ggml
.
KV
()
.
HeadCountKV
())
// this amount is the overhead + tensors in memory
graph
,
ok
:=
ggml
.
GraphSize
(
opts
.
NumCtx
,
min
(
opts
.
NumCtx
,
opts
.
NumBatch
))
// TODO: get this from the llama.cpp's graph calculations instead of
if
!
ok
{
// estimating it's 1/6 * kv_cache_size * num_gqa
graph
=
int64
(
ggml
.
KV
()
.
GQA
())
*
kv
/
6
graph
:=
int64
(
ggml
.
KV
()
.
GQA
())
*
kv
/
6
}
usedMemory
+=
graph
usedMemory
+=
graph
if
(
usedMemory
>
availableMemory
||
slices
.
Contains
(
cpuOnlyFamilies
,
ggml
.
KV
()
.
Architecture
()))
&&
info
.
Library
!=
"metal"
{
if
(
usedMemory
>
availableMemory
||
slices
.
Contains
(
cpuOnlyFamilies
,
ggml
.
KV
()
.
Architecture
()))
&&
info
.
Library
!=
"metal"
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment