OpenDAS / ollama · Commits

Commit cef2c605 (unverified)
Authored Jul 30, 2024 by Daniel Hiltgen, committed Jul 30, 2024 by GitHub

Merge pull request #5859 from dhiltgen/homogeneous_gpus

Prevent partial loading on mixed GPU brands

Parents: 0be8baad, 34542099

Changes: 2 files changed, 66 additions and 4 deletions (+66 -4)

  server/sched.go       +27 -4
  server/sched_test.go  +39 -0
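The change keys everything off the GPU "library" (the backend brand, e.g. "cuda" vs "rocm"): a model is now only ever placed on GPUs from a single library, and a partial load is confined to the best single library instead of spreading layers across brands. As a minimal sketch of the grouping the scheduler relies on, assuming gpus.ByLibrary() simply buckets the list by its Library field (the real implementation lives in ollama's gpu package and is not part of this diff):

package main

import "fmt"

// GpuInfo is a pared-down stand-in for gpu.GpuInfo; only the fields used here.
type GpuInfo struct {
	Library    string // backend brand, e.g. "cuda" or "rocm"
	FreeMemory uint64
}

// byLibrary groups GPUs by brand, preserving first-seen order. This mirrors
// what gpus.ByLibrary() is assumed to do in pickBestPartialFitByLibrary below.
func byLibrary(gpus []GpuInfo) [][]GpuInfo {
	var order []string
	groups := map[string][]GpuInfo{}
	for _, g := range gpus {
		if _, seen := groups[g.Library]; !seen {
			order = append(order, g.Library)
		}
		groups[g.Library] = append(groups[g.Library], g)
	}
	out := make([][]GpuInfo, 0, len(order))
	for _, lib := range order {
		out = append(out, groups[lib])
	}
	return out
}

func main() {
	mixed := []GpuInfo{{Library: "cuda", FreeMemory: 256 << 20}, {Library: "rocm", FreeMemory: 256 << 20}}
	fmt.Println(len(byLibrary(mixed))) // 2 groups; the scheduler loads onto exactly one of them
}

With that grouping in hand, the scheduler first looks for a library whose GPUs can hold the model fully, and only for the very first model does it accept a partial fit, still confined to one library, as the diff below shows.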
server/sched.go  (view file @ cef2c605)

@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else if loadedCount == 0 {
 				// No models loaded.  Load the model but prefer the best fit.
 				slog.Debug("loading first model", "model", pending.model.ModelPath)
-				g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+				g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
 				if g != nil {
 					gpus = g
+				} else {
+					// Only allow partial loads when this is the first model
+					gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 				}
 				s.loadFn(pending, ggml, gpus, numParallel)
 				break

@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				// Update free memory from currently loaded models
 				s.updateFreeSpace(availGpus)
-				fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+				fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
 				if fitGpus != nil {
 					slog.Debug("new model fits with existing models, loading")
 					s.loadFn(pending, ggml, fitGpus, numParallel)

@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64
 	var numParallelToTry []int

@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	return nil
 }

+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+	*numParallel = 1
+	byLibrary := gpus.ByLibrary()
+	if len(byLibrary) <= 1 {
+		return gpus
+	}
+	var bestEstimate uint64
+	var bestFit int
+	for i, gl := range byLibrary {
+		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		if estimatedVRAM > bestEstimate {
+			bestEstimate = estimatedVRAM
+			bestFit = i
+		}
+	}
+	return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
server/sched_test.go  (view file @ cef2c605)

@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }

+func TestHomogeneousGPUs(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+
+	s.getGpuFn = func() gpu.GpuInfoList {
+		// Set memory values to require the model to be spread
+		gpus := []gpu.GpuInfo{
+			{Library: "cuda"},
+			{Library: "rocm"},
+		}
+		gpus[0].TotalMemory = 1 * format.GibiByte
+		gpus[0].FreeMemory = 256 * format.MebiByte
+		gpus[1].TotalMemory = 1 * format.GibiByte
+		gpus[1].FreeMemory = 256 * format.MebiByte
+		return gpus
+	}
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		require.Len(t, gpus, 1)
+		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+	}
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
+	}
+}
+
 type mockLlm struct {
 	pingResp  error
 	waitResp  error
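The require.Len(t, gpus, 1) assertion inside the stubbed newServerFn is what enforces homogeneity: the fake GPU list contains one cuda and one rocm device whose memory values (per the in-test comment) would previously have caused the model to be spread across both, and the test now fails unless the scheduler hands the server exactly one of them. The exact command is not part of the commit, but a standard way to run just this test from the repository root would be:

	go test ./server -run TestHomogeneousGPUs -v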