OpenDAS / ollama / Commits / f9cd55c7

Commit f9cd55c7, authored Mar 09, 2024 by Jeffrey Morgan

disable gpu for certain model architectures and fix divide-by-zero on memory estimation

parent ac64cd4e
Showing 1 changed file with 12 additions and 4 deletions.

llm/llm.go (+12 -4)
@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime"
+	"slices"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
@@ -19,6 +20,10 @@ type LLM interface {
 	Close()
 }

+var cpuOnlyFamilies = []string{
+	"mamba",
+}
+
 func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
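The cpuOnlyFamilies list added above is consulted later in New with slices.Contains, which is why the hunk before it adds the "slices" import (standard library as of Go 1.21). A minimal, self-contained sketch of that gating pattern; modelFamily and the 999 default here are hypothetical stand-ins for ggml.ModelFamily() and the real num_gpu handling:

package main

import (
	"fmt"
	"slices"
)

// Mirrors the commit's list of architectures without GPU support.
var cpuOnlyFamilies = []string{
	"mamba",
}

func main() {
	numGPU := 999          // hypothetical "offload all layers" default
	modelFamily := "mamba" // stands in for ggml.ModelFamily()

	// certain model architectures don't support gpu inference yet
	if slices.Contains(cpuOnlyFamilies, modelFamily) {
		numGPU = 0
	}

	fmt.Println("num_gpu =", numGPU) // prints: num_gpu = 0
}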
@@ -48,13 +53,18 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	size := ggml.Size

 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.NumGQA()) * kv / 6

+	// certain model architectures don't support gpu inference yet
+	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+		opts.NumGPU = 0
+	}
+
 	info := gpu.GetGPUInfo()
 	switch runtime.GOOS {
 	case "darwin":
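The hunk above carries the divide-by-zero fix named in the commit message. The estimate prices fp16 K and V caches at 2 bytes per element, times 2 tensors (K and V), times n_ctx * n_layer * (n_embd / n_head) * n_head_kv; a model that reports zero attention heads previously made the trailing division panic with an integer divide by zero, and max(ggml.NumHead(), 1), using the Go 1.21 built-in, guards that. A runnable sketch of the arithmetic; the function name and the llama-7B-like dimensions are invented for illustration:

package main

import "fmt"

// kvCacheBytes mirrors the commit's estimate: 2 bytes per fp16 element *
// 2 tensors (K and V) * n_ctx * n_layer * (n_embd / n_head) * n_head_kv,
// with the parentheses folded into one expression. max(nHead, 1) is the
// divide-by-zero guard this commit adds for models reporting zero heads.
func kvCacheBytes(nCtx, nLayer, nEmbd, nHeadKV, nHead int64) int64 {
	return 2 * 2 * nCtx * nLayer * nEmbd * nHeadKV / max(nHead, 1)
}

func main() {
	// Hypothetical llama-7B-like shape: 2048-token context, 32 layers,
	// 4096-dim embeddings, 32 heads, no grouped-query attention.
	kv := kvCacheBytes(2048, 32, 4096, 32, 32)
	fmt.Printf("kv cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30))

	// Before the fix, a model reporting zero heads panicked here with
	// "integer divide by zero"; now the estimate divides by 1 instead.
	fmt.Println(kvCacheBytes(2048, 32, 4096, 32, 0))
}

With those numbers the K/V cache comes to exactly 1 GiB, and the graph overhead estimate in the same hunk (int64(ggml.NumGQA()) * kv / 6, with num_gqa = n_head / n_head_kv = 1 here) would add roughly another 171 MiB on top.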
@@ -63,9 +73,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		}

 		if size+kv+graph > vram {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
+			slog.Info("not enough vram available, setting num_gpu=0")
 			opts.NumGPU = 0
 			break
 		}
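The final hunk narrows the fallback: when the estimated total (size + kv + graph) exceeds available VRAM, only opts.NumGPU is zeroed, and the detected GPU library and variant are no longer overwritten with the CPU runner. A hedged sketch of just that decision, with invented byte counts standing in for ggml.Size, the kv and graph estimates, and the VRAM reported by gpu.GetGPUInfo():

package main

import (
	"fmt"
	"log/slog"
)

func main() {
	// Invented sizes in bytes: 7 GiB of weights, 1 GiB of kv cache,
	// 171 MiB of graph overhead, against 6 GiB of reported VRAM.
	var size, kv, graph int64 = 7 << 30, 1 << 30, 171 << 20
	var vram int64 = 6 << 30

	numGPU := 999 // hypothetical "offload all layers" default
	if size+kv+graph > vram {
		// As of this commit only num_gpu changes; the detected GPU
		// library and variant keep their values.
		slog.Info("not enough vram available, setting num_gpu=0")
		numGPU = 0
	}
	fmt.Println("num_gpu =", numGPU)
}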