OpenDAS / ollama · commit 0e331c71

Unverified commit 0e331c71, authored May 14, 2024 by Michael Yang, committed by GitHub on May 14, 2024.

Merge pull request #4328 from ollama/mxyng/mem

count memory up to NumGPU if set by user
Parents: a4b8d1f8, 1d359e73

Showing 1 changed file with 21 additions and 15 deletions.

llm/memory.go (+21, -15)
@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
+	layers := ggml.Tensors().Layers()
+
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		memoryMinimum += blk0.size()
+	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
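This first hunk folds one layer's size into memoryMinimum as headroom (which is why the blk.0 term drops out of the two memoryRequired sums in the next hunk) and leaves the fp16 KV-cache estimate unchanged. That estimate is easiest to sanity-check with concrete numbers. Below is a small, self-contained sketch of the same arithmetic; the hyperparameter values are hypothetical (roughly Llama-2-7B shaped), not read from any model file — in memory.go they come from ggml.KV().

package main

import "fmt"

func main() {
	// Hypothetical hyperparameters; in EstimateGPULayers these come
	// from ggml.KV() (BlockCount, EmbeddingLength, HeadCount, HeadCountKV).
	var (
		numCtx          uint64 = 2048 // opts.NumCtx
		blockCount      uint64 = 32   // n_layer
		embeddingLength uint64 = 4096 // n_embd
		headCount       uint64 = 32   // n_head
		headCountKV     uint64 = 32   // n_head_kv (smaller than n_head for GQA models)
	)

	// (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV

	fmt.Printf("kv cache: %d bytes (%.0f MiB)\n", kv, float64(kv)/(1<<20))
	// With these numbers: 2*2*2048*32*4096/32*32 = 1073741824 bytes = 1024 MiB.
}

With grouped-query attention (headCountKV < headCount), the same formula scales the cache down proportionally.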
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
@@ -100,24 +104,26 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			memoryLayer := blk.size()
 
-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
+			// KV is proportional to the number of layers
+			memoryLayer += kv / ggml.KV().BlockCount()
 
-		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
+			memoryRequiredTotal += memoryLayer
+			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+				memoryRequiredPartial += memoryLayer
+				layerCount++
+			}
 		}
 	}
 
 	if gpus[0].Library != "metal" || !opts.UseMMap {
 		// memory was not preallocated for output tensors
 		memoryRequiredTotal += memoryLayerOutput
 	}
 
-	if memoryAvailable > memoryRequiredTotal {
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}
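The behavioral change is in the two rewritten conditions: when the user sets NumGPU (>= 0), layers are counted up to that limit regardless of free memory; when NumGPU is negative (auto), the old memory-fit check still applies. A minimal sketch of that decision rule, with hypothetical inputs standing in for the real GGML and GPU plumbing in EstimateGPULayers:

package main

import "fmt"

// countOffloadLayers mirrors the loop's decision rule: honor an explicit
// user-requested layer count (numGPU >= 0), otherwise fall back to fitting
// layers into available memory. The function name and inputs are
// illustrative; the real logic lives in llm/memory.go.
func countOffloadLayers(numGPU int, memoryAvailable, memoryRequiredPartial uint64, layerSizes []uint64) (int, uint64) {
	layerCount := 0
	for _, memoryLayer := range layerSizes {
		if (numGPU >= 0 && layerCount+1 <= numGPU) ||
			(numGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
			memoryRequiredPartial += memoryLayer
			layerCount++
		}
	}
	return layerCount, memoryRequiredPartial
}

func main() {
	layers := []uint64{500 << 20, 500 << 20, 500 << 20, 500 << 20} // four 500 MiB layers

	// Auto mode (numGPU < 0): only as many layers as fit in 1200 MiB.
	fmt.Println(countOffloadLayers(-1, 1200<<20, 0, layers)) // 2 1048576000

	// Explicit request (numGPU = 4): all four layers are counted even
	// though they would not fit; the user's setting wins.
	fmt.Println(countOffloadLayers(4, 1200<<20, 0, layers)) // 4 2097152000
}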
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		"offload to gpu",
 		slog.Group(
 			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
+			// requested number of layers to offload
+			"requested", opts.NumGPU,
 			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",
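For reference, the renamed log keys render like this under slog's text handler; the values below are hypothetical stand-ins for opts.NumGPU and layerCount:

package main

import (
	"log/slog"
	"os"
)

func main() {
	// Illustrative only: log the renamed keys the way memory.go does.
	// "requested" is the user's NumGPU setting; "real" is the computed layerCount.
	logger := slog.New(slog.NewTextHandler(os.Stdout, nil))
	logger.Info("offload to gpu",
		slog.Group("layers",
			"requested", -1, // NumGPU unset: autodetect
			"real", 33,      // hypothetical layerCount
		),
	)
	// Output (timestamp varies):
	// time=... level=INFO msg="offload to gpu" layers.requested=-1 layers.real=33
}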