OpenDAS / ollama · Commits

Commit ab6be852
Authored Jan 11, 2024 by Jeffrey Morgan
Parent: b24e8d17

revisit memory allocation to account for full kv cache on main gpu

Showing 1 changed file with 56 additions and 75 deletions (+56 −75)
llm/llm.go (+56 −75)
@@ -2,7 +2,6 @@ package llm
 
 import (
 	"context"
-	"fmt"
 	"log"
 	"os"
 	"runtime"
@@ -41,94 +40,76 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}
 
-	fmt.Println("size", ggml.Size)
-	fmt.Println("filetype", ggml.FileType())
-	fmt.Println("architecture", ggml.ModelFamily())
-	fmt.Println("type", ggml.ModelType())
-	fmt.Println("name", ggml.Name())
-	fmt.Println("embd", ggml.NumEmbed())
-	fmt.Println("head", ggml.NumHead())
-	fmt.Println("head_kv", ggml.NumHeadKv())
-	fmt.Println("gqa", ggml.NumGQA())
-
-	available, _ := gpu.CheckVRAM()
-
-	// For now assume filesize = model size
-	// TODO: use actual model size
-	requiredModel := ggml.Size
+	vram, _ := gpu.CheckVRAM()
+	size := ggml.Size
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calcluations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
-	requiredTotal := requiredModel + requiredKv + requiredAlloc
-
-	log.Println("system memory bytes:", available)
-	log.Println("required model bytes:", requiredModel)
-	log.Println("required kv bytes:", requiredKv)
-	log.Println("required alloc bytes:", requiredAlloc)
-	log.Println("required total bytes:", requiredTotal)
+	graph := int64(ggml.NumGQA()) * kv / 6
 
 	info := gpu.GetGPUInfo()
 	library := info.Library
-
-	if opts.NumGPU == -1 {
-		// default to offloading all layers
-		opts.NumGPU = int(ggml.NumLayers()) + 1
-	}
-
-	// decide how many layers to put on the GPU
-	if opts.NumGPU > 0 {
-		switch runtime.GOOS {
-		case "darwin":
-			if requiredTotal > available {
-				log.Println("not enough vram available, falling back to CPU only")
-				opts.NumGPU = 0
-			}
-		default:
-			if library == "cpu" || library == "default" {
-				opts.NumGPU = 0
-				break
-			}
-
-			// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
-			// TODO: find the largest GPU and only reserve memory there
-			avgAvailable := available / int64(info.DeviceCount)
-			if requiredAlloc > avgAvailable {
-				log.Printf("not enough vram available, falling back to CPU only")
-				library = "cpu"
-				opts.NumGPU = 0
-				break
-			}
-
-			// we don't know which GPU will be used, so estimate
-			// the scratch buffer space on all of them
-			// TODO: allocate less layers to the GPU with the scratch buffer
-			// and more to the others (based on their available memory)
-			available -= requiredAlloc * int64(info.DeviceCount)
-
-			// no offloading required
-			if requiredModel+requiredKv <= available {
-				break
-			}
-
-			// fill remaining vram with layers
-			log.Println("splitting", available, "of available memory bytes into layers")
-			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
-			log.Println("bytes per layer:", bytesPerLayer)
-			layers := available / bytesPerLayer
-			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
-			if layers < int64(opts.NumGPU) {
-				opts.NumGPU = int(layers)
-			}
-		}
-	}
+	switch runtime.GOOS {
+	case "darwin":
+		if opts.NumGPU == 0 {
+			break
+		}
+
+		if size+kv+graph > vram {
+			log.Println("not enough vram available, falling back to CPU only")
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = 1
+	default:
+		if library == "cpu" || library == "default" {
+			log.Println("GPU not available, falling back to CPU")
+			opts.NumGPU = 0
+			break
+		}
+
+		// don't use GPU at all if no layers are loaded
+		if opts.NumGPU == 0 {
+			library = "cpu"
+			break
+		}
+
+		// user-defined GPU count
+		if opts.NumGPU != -1 {
+			break
+		}
+
+		// the "main" GPU needs the most memory and determines the limit
+		// of how many layers can be loaded. It needs to fit:
+		// 1. the full compute graph allocation for all devices (graph)
+		// 2. the proportional kv cache for all devices (kv * % layers)
+		// 3. the proportional model (size * % layers / # devices)
+		// This estimates the number of layers
+		maxlayers := int64(ggml.NumLayers()) + 1
+		devices := int64(info.DeviceCount)
+		avg := vram / devices
+		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		if layers > maxlayers {
+			layers = maxlayers
+		}
+
+		// 1 + 2 must fit on the main gpu
+		min := graph + kv*layers/maxlayers
+		if layers <= 0 || min > avg {
+			log.Printf("not enough vram available, falling back to CPU only")
+			library = "cpu"
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = int(layers)
+	}
 
+	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 
 	return newLlmServer(library, model, adapters, projectors, opts)
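For a sense of scale, the fp16 KV-cache formula and the 1/6 * kv_cache_size * num_gqa graph estimate used in this change work out as follows for a hypothetical 7B-class model. The dimensions below are illustrative assumptions, not values taken from this commit, and the snippet only mirrors the arithmetic rather than calling any ollama code:

package main

import "fmt"

func main() {
	// Assumed dimensions for a hypothetical 7B-class model (illustration only).
	var (
		numCtx    int64 = 2048                  // opts.NumCtx
		numLayers int64 = 32                    // ggml.NumLayers()
		numEmbed  int64 = 4096                  // ggml.NumEmbed()
		numHead   int64 = 32                    // ggml.NumHead()
		numHeadKv int64 = 32                    // ggml.NumHeadKv(), no GQA in this example
		numGQA    int64 = numHead / numHeadKv   // ggml.NumGQA()
	)

	// fp16 K,V matrices: n_ctx * n_layer * (n_embd / n_head) * n_head_kv
	// * 2 bytes per element * 2 (one K and one V matrix).
	kv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// Scratch/graph overhead, estimated as 1/6 * kv_cache_size * num_gqa,
	// the same rough estimate the commit uses.
	graph := numGQA * kv / 6

	fmt.Printf("kv cache ~ %d MiB\n", kv/(1<<20))   // ~1024 MiB at 2048 context
	fmt.Printf("graph    ~ %d MiB\n", graph/(1<<20)) // ~170 MiB
}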
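And a rough sketch of how the new main-GPU layer estimate behaves, again with made-up numbers (two GPUs, 8 GiB of free VRAM, ~13 GiB of fp16 weights). It mirrors the maxlayers/avg/min arithmetic added above under those assumptions; it is not a call into the ollama packages:

package main

import "fmt"

func main() {
	// Assumed figures, for illustration only.
	var (
		vram      int64 = 8 << 30   // total free VRAM across devices (gpu.CheckVRAM())
		size      int64 = 13 << 30  // model weights (ggml.Size), ~13 GiB fp16
		kv        int64 = 1 << 30   // full kv cache estimate from the formula above
		graph     int64 = 180 << 20 // compute graph / scratch estimate
		numLayers int64 = 32        // ggml.NumLayers()
		devices   int64 = 2         // info.DeviceCount
	)

	maxlayers := numLayers + 1 // +1 for the output layer
	avg := vram / devices      // VRAM available per device

	// Scale the layer count by what fits per device once the full graph
	// allocation is reserved on the main GPU.
	layers := maxlayers * (avg - graph) / (kv + size/devices)
	if layers > maxlayers {
		layers = maxlayers
	}

	// The graph plus the proportional kv cache must fit on the main GPU.
	min := graph + kv*layers/maxlayers

	if layers <= 0 || min > avg {
		fmt.Println("not enough vram, falling back to CPU")
		return
	}
	fmt.Println("offloading layers:", layers) // 16 of 33 layers with these numbers
}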