OpenDAS / ollama · Commit d790bf99 (unverified)
Authored Oct 13, 2023 by Michael Yang; committed by GitHub on Oct 13, 2023

Merge pull request #783 from jmorganca/mxyng/fix-gpu-offloading

fix: offloading on low end GPUs

Parents: 3553d107, 35afac09
Changes: 1 changed file with 33 additions and 15 deletions

llm/llama.go (+33 -15) @ d790bf99
@@ -30,42 +30,43 @@ import (
 var llamaCppEmbed embed.FS

 type ModelRunner struct {
-    Path string // path to the model runner executable
+    Path        string // path to the model runner executable
+    Accelerated bool
 }

 func chooseRunners(workDir, runnerType string) []ModelRunner {
     buildPath := path.Join("llama.cpp", runnerType, "build")
-    var runners []string
+    var runners []ModelRunner

     // set the runners based on the OS
     // IMPORTANT: the order of the runners in the array is the priority order
     switch runtime.GOOS {
     case "darwin":
-        runners = []string{
-            path.Join(buildPath, "metal", "bin", "ollama-runner"),
-            path.Join(buildPath, "cpu", "bin", "ollama-runner"),
+        runners = []ModelRunner{
+            {Path: path.Join(buildPath, "metal", "bin", "ollama-runner")},
+            {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
         }
     case "linux":
-        runners = []string{
-            path.Join(buildPath, "cuda", "bin", "ollama-runner"),
-            path.Join(buildPath, "cpu", "bin", "ollama-runner"),
+        runners = []ModelRunner{
+            {Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+            {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
         }
     case "windows":
         // TODO: select windows GPU runner here when available
-        runners = []string{
-            path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe"),
+        runners = []ModelRunner{
+            {Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
         }
     default:
         log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
-        runners = []string{
-            path.Join(buildPath, "cpu", "bin", "ollama-runner"),
+        runners = []ModelRunner{
+            {Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
         }
     }

     runnerAvailable := false // if no runner files are found in the embed, this flag will cause a fast fail
     for _, r := range runners {
         // find all the files in the runner's bin directory
-        files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r), "*"))
+        files, err := fs.Glob(llamaCppEmbed, path.Join(path.Dir(r.Path), "*"))
         if err != nil {
             // this is expected, ollama may be compiled without all runners packed in
             log.Printf("%s runner not found: %v", r, err)
@@ -115,7 +116,10 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
     localRunnersByPriority := []ModelRunner{}
     for _, r := range runners {
         // clean the ModelRunner paths so that they match the OS we are running on
-        localRunnersByPriority = append(localRunnersByPriority, ModelRunner{Path: filepath.Clean(path.Join(workDir, r))})
+        localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
+            Path:        filepath.Clean(path.Join(workDir, r.Path)),
+            Accelerated: r.Accelerated,
+        })
     }

     return localRunnersByPriority
@@ -215,6 +219,11 @@ func CheckVRAM() (int64, error) {
         free += vram
     }

+    if free*1024*1024 < 2*1000*1000*1000 {
+        log.Printf("less than 2 GB VRAM available, falling back to CPU only")
+        free = 0
+    }
+
     return free, nil
 }
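A note on the arithmetic in that new guard: the value summed into free appears to be in MiB, since the diff multiplies it by 1024*1024 to get bytes before comparing against a decimal 2 GB. Below is a minimal sketch of the same comparison; the helper name hasEnoughVRAM and the MiB assumption are illustrative, not part of the ollama source.

package main

import "fmt"

// hasEnoughVRAM mirrors the guard added to CheckVRAM above. freeMiB is assumed
// to be free VRAM in MiB (as the *1024*1024 conversion in the diff suggests);
// the floor is 2 GB in decimal bytes, matching 2*1000*1000*1000.
func hasEnoughVRAM(freeMiB int64) bool {
    return freeMiB*1024*1024 >= 2*1000*1000*1000
}

func main() {
    fmt.Println(hasEnoughVRAM(1907)) // false: ~1.9996 GB, just under the 2 GB floor
    fmt.Println(hasEnoughVRAM(2048)) // true:  ~2.147 GB, enough to keep GPU offloading on
}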
@@ -276,16 +285,20 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
         return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
     }

+    numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
     params := []string{
         "--model", model,
         "--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
         "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
         "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
         "--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-        "--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
         "--embedding",
     }

+    if numGPU > 0 {
+        params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", numGPU))
+    }
+
     if opts.NumGQA > 0 {
         params = append(params, "--gqa", fmt.Sprintf("%d", opts.NumGQA))
     }
@@ -316,6 +329,11 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
     // start the llama.cpp server with a retry in case the port is already in use
     for _, runner := range runners {
+        if runner.Accelerated && numGPU == 0 {
+            log.Printf("skipping accelerated runner because num_gpu=0")
+            continue
+        }
+
         if _, err := os.Stat(runner.Path); err != nil {
             log.Printf("llama runner not found: %v", err)
             continue
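Taken together, the pieces suggest the fallback path on low-end GPUs: when CheckVRAM reports less than 2 GB it returns 0, the computed numGPU ends up at 0, and the loop above then skips any runner marked Accelerated so the CPU runner (next in priority order) is launched instead. The sketch below condenses that selection; pickRunner and the hard-coded paths are illustrative only and not part of the ollama source.

package main

import "fmt"

// ModelRunner mirrors the struct from this diff: an executable path plus a flag
// marking GPU-accelerated builds (the CUDA runner on Linux, for example).
type ModelRunner struct {
    Path        string
    Accelerated bool
}

// pickRunner is a hypothetical helper condensing the loop above: walk the
// priority-ordered runners and skip accelerated ones whenever no layers will
// be offloaded (numGPU == 0).
func pickRunner(runners []ModelRunner, numGPU int) (ModelRunner, bool) {
    for _, r := range runners {
        if r.Accelerated && numGPU == 0 {
            continue // low-VRAM case: fall through to the CPU runner
        }
        return r, true
    }
    return ModelRunner{}, false
}

func main() {
    runners := []ModelRunner{
        {Path: "build/cuda/bin/ollama-runner", Accelerated: true},
        {Path: "build/cpu/bin/ollama-runner"},
    }
    if r, ok := pickRunner(runners, 0); ok {
        fmt.Println(r.Path) // prints the CPU runner path
    }
}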