OpenDAS / ollama

Commit 50b9056e, authored May 10, 2024 by Michael Yang

count memory up to NumGPU

Parent: 9c76b30d
Showing 1 changed file with 11 additions and 9 deletions.

llm/memory.go  +11 -9
@@ -53,6 +53,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
+	layers := ggml.Tensors().Layers()
+
+	// add one layer worth of memory as a buffer
+	memoryMinimum += layers["blk.0"].size()
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
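Note on the kv line above: Go evaluates * and / left to right, so n_embd / n_head * n_head_kv is the per-head dimension times the number of KV heads, and the leading 2 * 2 is (k + v) * sizeof(float16), matching the comment. A small runnable sketch of the same arithmetic, using hypothetical, roughly Llama-2-7B-shaped parameters (illustrative values, not taken from this commit):

package main

import "fmt"

func main() {
	// Hypothetical model parameters; illustrative only.
	var (
		numCtx          uint64 = 2048 // opts.NumCtx after the max() clamp
		blockCount      uint64 = 32   // n_layer
		embeddingLength uint64 = 4096 // n_embd
		headCount       uint64 = 32   // n_head
		headCountKV     uint64 = 32   // n_head_kv (no grouped-query attention here)
	)

	// Same shape as the source expression: left to right, so
	// n_embd / n_head * n_head_kv is head_dim * n_head_kv.
	kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV

	fmt.Printf("fp16 kv cache: %d bytes (%.1f GiB)\n", kv, float64(kv)/(1<<30))
	// Output: fp16 kv cache: 1073741824 bytes (1.0 GiB)
}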
@@ -73,13 +77,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
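Taken together with the first hunk, this change folds the one-layer blk.0 buffer into memoryMinimum once instead of adding it separately at both use sites, so the computed totals are unchanged. A runnable sketch of that equivalence, with hypothetical byte counts (illustrative only):

package main

import "fmt"

func main() {
	// Hypothetical sizes in bytes; illustrative only.
	var (
		memoryMinimum       uint64 = 512 << 20 // baseline VRAM floor
		graphFullOffload    uint64 = 300 << 20
		graphPartialOffload uint64 = 200 << 20
		blk0Size            uint64 = 400 << 20 // stand-in for layers["blk.0"].size()
	)

	// Before: the one-layer buffer was added at each use site.
	oldTotal := memoryMinimum + graphFullOffload + blk0Size
	oldPartial := memoryMinimum + graphPartialOffload + blk0Size

	// After: the buffer is folded into memoryMinimum once (first hunk),
	// so the use sites drop the explicit blk.0 term.
	memoryMinimum += blk0Size
	newTotal := memoryMinimum + graphFullOffload
	newPartial := memoryMinimum + graphPartialOffload

	fmt.Println(oldTotal == newTotal, oldPartial == newPartial) // true true
}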
@@ -106,7 +108,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayer += kv / ggml.KV().BlockCount()
 
 		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+		if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
 			memoryRequiredPartial += memoryLayer
 			layerCount++
 		}
@@ -117,7 +119,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryRequiredTotal += memoryLayerOutput
 	}
 
-	if memoryAvailable > memoryRequiredTotal {
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}
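Both rewritten conditionals above implement the policy the commit title describes: when opts.NumGPU >= 0 the user has pinned the layer count, so layers are counted up to NumGPU regardless of free memory; when NumGPU is negative (auto), the old memory-availability check still decides. A minimal sketch of the shared predicate, pulled into a hypothetical helper (shouldOffload is not part of the commit; in the per-layer loop memoryRequired corresponds to memoryRequiredPartial+memoryLayer, and in the full-offload check to memoryRequiredTotal):

package main

import "fmt"

// shouldOffload is a hypothetical helper restating the predicate used in
// both rewritten conditionals.
func shouldOffload(numGPU, layerCount int, memoryAvailable, memoryRequired uint64) bool {
	if numGPU >= 0 {
		// Explicit request: count layers up to NumGPU, ignoring free memory.
		return layerCount+1 <= numGPU
	}
	// Auto mode (NumGPU < 0): offload only while it fits.
	return memoryAvailable > memoryRequired
}

func main() {
	// With NumGPU pinned to 10, the 10th layer is accepted even with no free memory.
	fmt.Println(shouldOffload(10, 9, 0, 1<<30)) // true
	// In auto mode, the same state is rejected because it does not fit.
	fmt.Println(shouldOffload(-1, 9, 0, 1<<30)) // false
}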
@@ -128,10 +130,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		"offload to gpu",
 		slog.Group(
 			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
+			// requested number of layers to offload
+			"requested", opts.NumGPU,
 			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",
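The renamed log fields mirror that split: "requested" reports what the user asked for (opts.NumGPU, where a negative value conventionally means auto in ollama's options), while "real" reports what the estimator settled on (layerCount). A minimal log/slog sketch with placeholder values (Go 1.21+; not the commit's actual output):

package main

import "log/slog"

func main() {
	// Placeholder values standing in for opts.NumGPU and layerCount.
	requested, offloaded := -1, 33

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// what the user asked for (negative means auto)
			"requested", requested,
			// what the estimator decided can actually be offloaded
			"real", offloaded,
		),
	)
}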