OpenDAS / ollama · Commit 86279f4a (unverified)

Authored Sep 25, 2023 by Bruce MacDonald; committed via GitHub on Sep 25, 2023.
unbound max num gpu layers (#591)
Co-authored-by: Michael Yang <mxyng@pm.me>
Parent: b934bf23
Showing 4 changed files, with 36 additions and 29 deletions:
- llm/ggml.go (+1, −0)
- llm/gguf.go (+10, −0)
- llm/llama.go (+23, −27)
- llm/llm.go (+2, −2)
llm/ggml.go

```diff
@@ -77,6 +77,7 @@ type model interface {
 	ModelFamily() string
 	ModelType() string
 	FileType() string
+	NumLayers() int64
 }
 
 type container interface {
```
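The only change here widens the shared `model` interface: every container implementation must now report its layer count. As a hedged illustration of what that obligation looks like (the stub type and its return values below are invented for this sketch, not taken from the repository):

```go
package main

import "fmt"

// Trimmed-down copy of the model interface from llm/ggml.go,
// including the newly added NumLayers method.
type model interface {
	ModelFamily() string
	ModelType() string
	FileType() string
	NumLayers() int64
}

// stubModel is a hypothetical implementation standing in for
// ggufModel and llamaModel; its values are illustrative only.
type stubModel struct{ layers int64 }

func (stubModel) ModelFamily() string { return "llama" }
func (stubModel) ModelType() string   { return "7B" }
func (stubModel) FileType() string    { return "Q4_0" }
func (s stubModel) NumLayers() int64  { return s.layers }

// Compile-time check that the stub satisfies the widened interface.
var _ model = stubModel{}

func main() {
	fmt.Println(stubModel{layers: 32}.NumLayers()) // prints 32
}
```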
llm/gguf.go

```diff
@@ -195,6 +195,16 @@ func (llm *ggufModel) Decode(r io.Reader) error {
 	return nil
 }
 
+func (llm *ggufModel) NumLayers() int64 {
+	value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
+	if !exists {
+		return 0
+	}
+
+	v := value.(uint32)
+	return int64(v)
+}
+
 func (ggufModel) readU8(r io.Reader) uint8 {
 	var u8 uint8
 	binary.Read(r, binary.LittleEndian, &u8)
```
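In GGUF, the block count is plain key/value metadata keyed by model family, so `NumLayers` reduces to a map lookup plus a type assertion. A minimal sketch of that lookup, assuming a hypothetical decoded `kv` map for a llama-family model (the map literal is invented for illustration):

```go
package main

import "fmt"

func main() {
	// Assumed stand-in for ggufModel.kv after Decode has run.
	kv := map[string]any{
		"llama.block_count": uint32(32), // e.g. a 7B llama model has 32 blocks
	}

	family := "llama" // what llm.ModelFamily() would return here
	value, exists := kv[fmt.Sprintf("%s.block_count", family)]
	if !exists {
		fmt.Println(0) // same fallback as ggufModel.NumLayers
		return
	}
	fmt.Println(int64(value.(uint32))) // prints 32
}
```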
llm/llama.go

```diff
@@ -152,6 +152,10 @@ func (llm *llamaModel) FileType() string {
 	return fileType(llm.hyperparameters.FileType)
 }
 
+func (llm *llamaModel) NumLayers() int64 {
+	return int64(llm.hyperparameters.NumLayer)
+}
+
 type llamaHyperparameters struct {
 	// NumVocab is the size of the model's vocabulary.
 	NumVocab uint32
@@ -207,13 +211,13 @@ func CheckVRAM() (int, error) {
 	return total, nil
 }
 
-func NumGPU(opts api.Options) int {
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
 	n := 1 // default to enable metal on macOS
 	if runtime.GOOS == "linux" {
-		vram, err := CheckVRAM()
+		vramMib, err := CheckVRAM()
 		if err != nil {
 			if err.Error() != "nvidia-smi command failed" {
 				log.Print(err.Error())
@@ -221,33 +225,25 @@ func NumGPU(opts api.Options) int {
 			// nvidia driver not installed or no nvidia GPU found
 			return 0
 		}
-		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
-		switch {
-		case vram < 500:
-			log.Printf("WARNING: Low VRAM detected, disabling GPU")
-			n = 0
-		case vram < 1000:
-			n = 4
-		case vram < 2000:
-			n = 8
-		case vram < 4000:
-			n = 12
-		case vram < 8000:
-			n = 16
-		case vram < 12000:
-			n = 24
-		case vram < 16000:
-			n = 32
-		default:
-			n = 48
-		}
-		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+
+		totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
+
+		// Calculate bytes per layer
+		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+		bytesPerLayer := fileSizeBytes / numLayer
+
+		// set n to the max number of layers we can fit in VRAM
+		return int(totalVramBytes / bytesPerLayer)
+
+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
 	}
-	return n
+	// default to enable metal on macOS
+	return 1
 }
 
-func newLlama(model string, adapters []string, runners []ModelRunner, opts api.Options) (*llama, error) {
-	if _, err := os.Stat(model); err != nil {
+func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
+	fileInfo, err := os.Stat(model)
+	if err != nil {
 		return nil, err
 	}
@@ -261,7 +257,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, opts api.O
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
 		"--embedding",
 	}
```
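This is the substance of the commit: the capped `switch` over VRAM buckets (which never loaded more than 48 layers, however large the card) is replaced by simple arithmetic that estimates bytes per layer as file size over layer count and offloads as many layers as fit. A self-contained sketch of that arithmetic with made-up inputs (a ~4 GiB weights file, 32 layers, 8192 MiB of reported VRAM; none of these numbers come from the diff):

```go
package main

import "fmt"

func main() {
	// Assumed example inputs, not values from the diff.
	var (
		numLayer      int64 = 32      // from ggml.NumLayers()
		fileSizeBytes int64 = 4 << 30 // ~4 GiB weights file, via os.Stat
		vramMib       int64 = 8192    // what CheckVRAM() might report
	)

	totalVramBytes := vramMib * 1024 * 1024   // 1 MiB = 1024^2 bytes
	bytesPerLayer := fileSizeBytes / numLayer // 128 MiB per layer
	n := totalVramBytes / bytesPerLayer

	// 8192 MiB / 128 MiB-per-layer = 64: more than the model has, so
	// llama.cpp offloads all 32 layers. The old switch would have capped
	// this same card at 24 layers (the vram < 12000 bucket).
	fmt.Printf("loading up to %d GPU layers\n", n)
}
```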
llm/llm.go

```diff
@@ -91,9 +91,9 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error
 	switch ggml.Name() {
 	case "gguf":
 		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
-		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
 	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
 	}
```
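Taken together, the four files thread the layer count from model decode down to the runner's command line, while an explicit `opts.NumGPU` still wins. A hypothetical condensation of that flow (the function name, inlined VRAM figure, and inputs are assumptions for the sketch, not the repository's API):

```go
package main

import "fmt"

// buildArgs condenses New -> newLlama -> NumGPU: New supplies
// ggml.NumLayers(), newLlama supplies os.Stat(model).Size(), and
// NumGPU turns both into the --n-gpu-layers flag value.
func buildArgs(numLayers, fileSizeBytes int64, requested int) []string {
	n := requested
	if requested == -1 { // -1 means "let ollama decide", as with opts.NumGPU
		totalVramBytes := int64(8192) * 1024 * 1024 // assumed 8 GiB card
		n = int(totalVramBytes / (fileSizeBytes / numLayers))
	}
	return []string{"--n-gpu-layers", fmt.Sprintf("%d", n)}
}

func main() {
	fmt.Println(buildArgs(32, 4<<30, -1)) // [--n-gpu-layers 64]
	fmt.Println(buildArgs(32, 4<<30, 10)) // explicit setting wins: [--n-gpu-layers 10]
}
```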