Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
4736391b
Commit
4736391b
authored
May 06, 2024
by
Michael Yang
Browse files
llm: add minimum based on layer size
parent
7c533041
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
7 deletions
+7
-7
gpu/gpu.go
gpu/gpu.go
+2
-2
gpu/gpu_darwin.go
gpu/gpu_darwin.go
+1
-1
llm/memory.go
llm/memory.go
+4
-4
No files found.
gpu/gpu.go
View file @
4736391b
...
@@ -31,8 +31,8 @@ type handles struct {
...
@@ -31,8 +31,8 @@ type handles struct {
}
}
const
(
const
(
cudaMinimumMemory
=
457
*
format
.
MebiByte
cudaMinimumMemory
=
256
*
format
.
MebiByte
rocmMinimumMemory
=
457
*
format
.
MebiByte
rocmMinimumMemory
=
256
*
format
.
MebiByte
)
)
var
gpuMutex
sync
.
Mutex
var
gpuMutex
sync
.
Mutex
...
...
gpu/gpu_darwin.go
View file @
4736391b
...
@@ -15,7 +15,7 @@ import (
...
@@ -15,7 +15,7 @@ import (
)
)
const
(
const
(
metalMinimumMemory
=
512
*
format
.
MebiByte
metalMinimumMemory
=
384
*
format
.
MebiByte
)
)
func
GetGPUInfo
()
GpuInfoList
{
func
GetGPUInfo
()
GpuInfoList
{
...
...
llm/memory.go
View file @
4736391b
...
@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
...
@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
graphPartialOffload
=
graphFullOffload
graphPartialOffload
=
graphFullOffload
}
}
layers
:=
ggml
.
Tensors
()
.
Layers
()
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
memoryRequiredTotal
:=
memoryMinimum
+
graphFullOffload
memoryRequiredTotal
:=
memoryMinimum
+
graphFullOffload
+
layers
[
"blk.0"
]
.
size
()
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
memoryRequiredPartial
:=
memoryMinimum
+
graphPartialOffload
memoryRequiredPartial
:=
memoryMinimum
+
graphPartialOffload
+
layers
[
"blk.0"
]
.
size
()
if
memoryRequiredPartial
>
memoryAvailable
{
if
memoryRequiredPartial
>
memoryAvailable
{
slog
.
Debug
(
"insufficient VRAM to load any model layers"
)
slog
.
Debug
(
"insufficient VRAM to load any model layers"
)
return
0
,
0
return
0
,
0
}
}
layers
:=
ggml
.
Tensors
()
.
Layers
()
var
memoryLayerOutput
uint64
var
memoryLayerOutput
uint64
if
layer
,
ok
:=
layers
[
"output_norm"
];
ok
{
if
layer
,
ok
:=
layers
[
"output_norm"
];
ok
{
memoryLayerOutput
+=
layer
.
size
()
memoryLayerOutput
+=
layer
.
size
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment