Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
a8b9b930
Commit
a8b9b930
authored
Apr 17, 2024
by
Michael Yang
Browse files
account for all non-repeating layers
parent
9df6c85c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
50 additions
and
11 deletions
+50
-11
llm/ggml.go
llm/ggml.go
+2
-1
llm/server.go
llm/server.go
+48
-10
No files found.
llm/ggml.go
View file @
a8b9b930
...
...
@@ -164,7 +164,8 @@ func (ts Tensors) Layers() map[string]Layer {
for
_
,
t
:=
range
ts
{
parts
:=
strings
.
Split
(
t
.
Name
,
"."
)
if
parts
[
0
]
==
"blk"
{
parts
=
parts
[
1
:
]
// join first and second part, e.g. blk.%d
parts
=
append
([]
string
{
fmt
.
Sprintf
(
"%s.%s"
,
parts
[
0
],
parts
[
1
])},
parts
[
2
:
]
...
)
}
if
_
,
ok
:=
layers
[
parts
[
0
]];
!
ok
{
...
...
llm/server.go
View file @
a8b9b930
...
...
@@ -97,7 +97,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
var
layerCount
int
layers
:=
ggml
.
Tensors
()
.
Layers
()
for
i
:=
0
;
i
<
int
(
ggml
.
KV
()
.
BlockCount
());
i
++
{
memoryLayer
:=
layers
[
fmt
.
Sprintf
(
"%d"
,
i
)]
.
size
()
memoryLayer
:=
layers
[
fmt
.
Sprintf
(
"
blk.
%d"
,
i
)]
.
size
()
// KV is proportional to the number of layers
memoryLayer
+=
kv
/
ggml
.
KV
()
.
BlockCount
()
...
...
@@ -109,7 +109,14 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
}
}
memoryLayerOutput
:=
layers
[
"output"
]
.
size
()
var
memoryLayerOutput
uint64
for
k
,
v
:=
range
layers
{
if
!
strings
.
HasPrefix
(
k
,
"blk."
)
{
slog
.
Info
(
"aaa"
,
"name"
,
k
,
"size"
,
format
.
HumanBytes2
(
v
.
size
()))
memoryLayerOutput
+=
v
.
size
()
}
}
memoryRequiredTotal
+=
memoryLayerOutput
if
info
.
Library
==
"metal"
&&
memoryRequiredTotal
>
info
.
TotalMemory
{
...
...
@@ -124,16 +131,47 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
opts
.
NumGPU
=
layerCount
}
memoryWeights
:=
memoryRequiredTotal
-
memoryMinimum
-
graphFullOffload
-
kv
slog
.
Info
(
"offload to gpu"
,
"reallayers"
,
opts
.
NumGPU
,
"layers"
,
layerCount
,
"required"
,
format
.
HumanBytes2
(
memoryRequiredTotal
),
"used"
,
format
.
HumanBytes2
(
memoryRequiredPartial
),
"available"
,
format
.
HumanBytes2
(
memoryAvailable
),
"kv"
,
format
.
HumanBytes2
(
kv
),
"fulloffload"
,
format
.
HumanBytes2
(
graphFullOffload
),
"partialoffload"
,
format
.
HumanBytes2
(
graphPartialOffload
),
slog
.
Group
(
"layers"
,
// actual number of layers offloaded
"real"
,
opts
.
NumGPU
,
// estimated number of layers that can be offloaded
"estimate"
,
layerCount
,
),
slog
.
Group
(
"memory"
,
// memory available for offloading
"available"
,
format
.
HumanBytes2
(
memoryAvailable
),
slog
.
Group
(
"required"
,
// memory required for full offloading
"full"
,
format
.
HumanBytes2
(
memoryRequiredTotal
),
// memory required to offload layers.estimate layers
"partial"
,
format
.
HumanBytes2
(
memoryRequiredPartial
),
// memory of KV cache
"kv"
,
format
.
HumanBytes2
(
kv
),
),
slog
.
Group
(
"weights"
,
// memory of the weights
"total"
,
format
.
HumanBytes2
(
memoryWeights
),
// memory of repeating layers
"repeating"
,
format
.
HumanBytes2
(
memoryWeights
-
memoryLayerOutput
),
// memory of non-repeating layers
"nonrepeating"
,
format
.
HumanBytes2
(
memoryLayerOutput
),
),
slog
.
Group
(
"graph"
,
// memory of graph when fully offloaded
"full"
,
format
.
HumanBytes2
(
graphFullOffload
),
// memory of graph when not fully offloaded
"partial"
,
format
.
HumanBytes2
(
graphPartialOffload
),
),
),
)
if
len
(
adapters
)
>
1
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment