OpenDAS / ollama · Commits

Commit ab6be852, authored Jan 11, 2024 by Jeffrey Morgan

    revisit memory allocation to account for full kv cache on main gpu

Parent: b24e8d17

Showing 1 changed file with 56 additions and 75 deletions

llm/llm.go (+56, -75)
@@ -2,7 +2,6 @@ package llm
 import (
     "context"
-    "fmt"
     "log"
     "os"
     "runtime"
@@ -41,94 +40,76 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
         opts.NumCtx = 4
     }

-    fmt.Println("size", ggml.Size)
-    fmt.Println("filetype", ggml.FileType())
-    fmt.Println("architecture", ggml.ModelFamily())
-    fmt.Println("type", ggml.ModelType())
-    fmt.Println("name", ggml.Name())
-    fmt.Println("embd", ggml.NumEmbed())
-    fmt.Println("head", ggml.NumHead())
-    fmt.Println("head_kv", ggml.NumHeadKv())
-    fmt.Println("gqa", ggml.NumGQA())
-
-    available, _ := gpu.CheckVRAM()
-
-    // For now assume filesize = model size
-    // TODO: use actual model size
-    requiredModel := ggml.Size
+    vram, _ := gpu.CheckVRAM()
+    size := ggml.Size

     // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-    requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

     // this amount is the overhead + tensors in memory
     // TODO: get this from the llama.cpp's graph calcluations instead of
     // estimating it's 1/6 * kv_cache_size * num_gqa
-    requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
-
-    requiredTotal := requiredModel + requiredKv + requiredAlloc
-
-    log.Println("system memory bytes:", available)
-    log.Println("required model bytes:", requiredModel)
-    log.Println("required kv bytes:", requiredKv)
-    log.Println("required alloc bytes:", requiredAlloc)
-    log.Println("required total bytes:", requiredTotal)
+    graph := int64(ggml.NumGQA()) * kv / 6

     info := gpu.GetGPUInfo()
     library := info.Library

-    if opts.NumGPU == -1 {
-        // default to offloading all layers
-        opts.NumGPU = int(ggml.NumLayers()) + 1
-    }
-
-    // decide how many layers to put on the GPU
-    if opts.NumGPU > 0 {
-        switch runtime.GOOS {
-        case "darwin":
-            if requiredTotal > available {
-                log.Println("not enough vram available, falling back to CPU only")
-                opts.NumGPU = 0
-            }
-        default:
-            if library == "cpu" || library == "default" {
-                opts.NumGPU = 0
-                break
-            }
-
-            // alloc buffer and kv cache is allocated as a fixed amount on the main gpu
-            // TODO: find the largest GPU and only reserve memory there
-            avgAvailable := available / int64(info.DeviceCount)
-            if requiredAlloc > avgAvailable {
-                log.Printf("not enough vram available, falling back to CPU only")
-                library = "cpu"
-                opts.NumGPU = 0
-                break
-            }
-
-            // we don't know which GPU will be used, so estimate
-            // the scratch buffer space on all of them
-            // TODO: allocate less layers to the GPU with the scratch buffer
-            // and more to the others (based on their available memory)
-            available -= requiredAlloc * int64(info.DeviceCount)
-
-            // no offloading required
-            if requiredModel+requiredKv <= available {
-                break
-            }
-
-            // fill remaining vram with layers
-            log.Println("splitting", available, "of available memory bytes into layers")
-            bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
-            log.Println("bytes per layer:", bytesPerLayer)
-            layers := available / bytesPerLayer
-            log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
-            if layers < int64(opts.NumGPU) {
-                opts.NumGPU = int(layers)
-            }
-        }
-    }
+    switch runtime.GOOS {
+    case "darwin":
+        if opts.NumGPU == 0 {
+            break
+        }
+
+        if size+kv+graph > vram {
+            log.Println("not enough vram available, falling back to CPU only")
+            opts.NumGPU = 0
+            break
+        }
+
+        opts.NumGPU = 1
+    default:
+        if library == "cpu" || library == "default" {
+            log.Println("GPU not available, falling back to CPU")
+            opts.NumGPU = 0
+            break
+        }
+
+        // don't use GPU at all if no layers are loaded
+        if opts.NumGPU == 0 {
+            library = "cpu"
+            break
+        }
+
+        // user-defined GPU count
+        if opts.NumGPU != -1 {
+            break
+        }
+
+        // the "main" GPU needs the most memory and determines the limit
+        // of how many layers can be loaded. It needs to fit:
+        // 1. the full compute graph allocation for all devices (graph)
+        // 2. the proportional kv cache for all devices (kv * % layers)
+        // 3. the proportional model (size * % layers / # devices)
+        // This estimates the number of layers
+        maxlayers := int64(ggml.NumLayers()) + 1
+        devices := int64(info.DeviceCount)
+        avg := vram / devices
+        layers := maxlayers * (avg - graph) / (kv + size/devices)
+        if layers > maxlayers {
+            layers = maxlayers
+        }
+
+        // 1 + 2 must fit on the main gpu
+        min := graph + kv*layers/maxlayers
+        if layers <= 0 || min > avg {
+            log.Printf("not enough vram available, falling back to CPU only")
+            library = "cpu"
+            opts.NumGPU = 0
+            break
+        }
+
+        opts.NumGPU = int(layers)
+    }

     opts.NumGQA = 0
     opts.RopeFrequencyBase = 0.0
     opts.RopeFrequencyScale = 0.0

     return newLlmServer(library, model, adapters, projectors, opts)
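To make the new kv and graph estimates concrete, here is a minimal standalone sketch that plugs assumed dimensions for a Llama-2-7B-style f16 model (2048-token context, 32 layers, 4096-wide embeddings, 32 attention heads, 32 KV heads) into the same formulas the commit uses. The model dimensions are illustrative assumptions, not values taken from this commit.

// kvestimate.go - a sketch of the kv/graph arithmetic above,
// using assumed model dimensions (not values from the commit)
package main

import "fmt"

func main() {
    var (
        numCtx    int64 = 2048 // opts.NumCtx
        numLayers int64 = 32   // ggml.NumLayers()
        numEmbed  int64 = 4096 // ggml.NumEmbed()
        numHead   int64 = 32   // ggml.NumHead()
        numHeadKv int64 = 32   // ggml.NumHeadKv(); no grouped-query attention on this model
    )
    numGQA := numHead / numHeadKv // ggml.NumGQA() would be 1 here

    // fp16 K and V matrices: 2 bytes per element * 2 (key and value)
    // * n_ctx * n_layer * n_embd * n_head_kv / n_head
    kv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead
    fmt.Println("estimated kv cache bytes:", kv) // 1073741824 (1 GiB) for these inputs

    // graph overhead estimate from the commit: num_gqa * kv / 6
    graph := numGQA * kv / 6
    fmt.Println("estimated graph bytes:", graph) // ~171 MiB for these inputs
}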
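The layer estimate in the default (multi-GPU) branch is easier to follow with numbers. The sketch below works the same formulas through an assumed setup: the ~13 GiB model and 1 GiB kv cache from the previous example split across two GPUs with 6 GiB of free VRAM each. All inputs are illustrative assumptions, not measurements from the commit.

// layersplit.go - a sketch of the main-GPU layer estimate above,
// using assumed VRAM and model sizes (not values from the commit)
package main

import "fmt"

func main() {
    var (
        size      int64 = 13 << 30      // model bytes (assumed ~13 GiB of f16 weights)
        kv        int64 = 1 << 30       // full kv cache bytes (from the previous sketch)
        graph     int64 = 1 * kv / 6    // compute-graph overhead estimate (num_gqa = 1)
        vram      int64 = 2 * (6 << 30) // total free VRAM across devices
        devices   int64 = 2             // info.DeviceCount
        maxlayers int64 = 32 + 1        // ggml.NumLayers() + 1
    )

    // The main GPU must hold the full graph plus its share of the kv cache
    // and model, so layers are scaled by what fits on an average device
    // after reserving the graph.
    avg := vram / devices
    layers := maxlayers * (avg - graph) / (kv + size/devices)
    if layers > maxlayers {
        layers = maxlayers
    }

    // the graph plus the proportional kv cache must fit on the main GPU
    min := graph + kv*layers/maxlayers

    fmt.Println("layers to offload:", layers)   // 25 of 33 for these inputs
    fmt.Println("main gpu minimum bytes:", min) // ~946 MiB for these inputs
    fmt.Println("fits on main gpu:", layers > 0 && min <= avg)
}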