Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
83d6d46e
Unverified
Commit
83d6d46e
authored
May 09, 2024
by
Daniel Hiltgen
Committed by
GitHub
May 09, 2024
Browse files
Merge pull request #4299 from dhiltgen/handle_vram_reporting_lag
Wait for GPU free memory reporting to converge
parents
dc18eee3
354ad925
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
3 deletions
+61
-3
gpu/cpu_common.go
gpu/cpu_common.go
+3
-3
server/sched.go
server/sched.go
+58
-0
No files found.
gpu/cpu_common.go
View file @
83d6d46e
...
@@ -8,14 +8,14 @@ import (
...
@@ -8,14 +8,14 @@ import (
func
GetCPUVariant
()
string
{
func
GetCPUVariant
()
string
{
if
cpu
.
X86
.
HasAVX2
{
if
cpu
.
X86
.
HasAVX2
{
slog
.
Info
(
"CPU has AVX2"
)
slog
.
Debug
(
"CPU has AVX2"
)
return
"avx2"
return
"avx2"
}
}
if
cpu
.
X86
.
HasAVX
{
if
cpu
.
X86
.
HasAVX
{
slog
.
Info
(
"CPU has AVX"
)
slog
.
Debug
(
"CPU has AVX"
)
return
"avx"
return
"avx"
}
}
slog
.
Info
(
"CPU does not have vector extensions"
)
slog
.
Debug
(
"CPU does not have vector extensions"
)
// else LCD
// else LCD
return
""
return
""
}
}
server/sched.go
View file @
83d6d46e
...
@@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
...
@@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
s
.
loadedMu
.
Lock
()
s
.
loadedMu
.
Lock
()
slog
.
Debug
(
"got lock to unload"
,
"model"
,
runner
.
model
)
slog
.
Debug
(
"got lock to unload"
,
"model"
,
runner
.
model
)
finished
:=
runner
.
waitForVRAMRecovery
()
runner
.
unload
()
runner
.
unload
()
delete
(
s
.
loaded
,
runner
.
model
)
delete
(
s
.
loaded
,
runner
.
model
)
s
.
loadedMu
.
Unlock
()
s
.
loadedMu
.
Unlock
()
slog
.
Debug
(
"runner released"
,
"model"
,
runner
.
model
)
slog
.
Debug
(
"runner released"
,
"model"
,
runner
.
model
)
runner
.
refMu
.
Unlock
()
runner
.
refMu
.
Unlock
()
<-
finished
slog
.
Debug
(
"sending an unloaded event"
,
"model"
,
runner
.
model
)
slog
.
Debug
(
"sending an unloaded event"
,
"model"
,
runner
.
model
)
s
.
unloadedCh
<-
struct
{}{}
s
.
unloadedCh
<-
struct
{}{}
}
}
...
@@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
...
@@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
return
false
return
false
}
}
// Free memory reporting on GPUs can lag for a while even after the runner
// exits, so we have to keep checking until we see the available memory recover,
// otherwise subsequent model loads will get far less layers loaded or worse
// case, may completely fall back to CPU mode.
// This routine must be called before the runner unloads so it can establish
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway
func
(
runner
*
runnerRef
)
waitForVRAMRecovery
()
chan
interface
{}
{
finished
:=
make
(
chan
interface
{},
1
)
// CPU or Metal don't need checking, so no waiting required
if
len
(
runner
.
gpus
)
==
1
&&
(
runner
.
gpus
[
0
]
.
Library
==
"cpu"
||
runner
.
gpus
[
0
]
.
Library
==
"metal"
)
{
finished
<-
struct
{}{}
return
finished
}
start
:=
time
.
Now
()
// Establish a baseline before we unload
gpusBefore
:=
gpu
.
GetGPUInfo
()
var
totalMemoryBefore
,
freeMemoryBefore
uint64
for
_
,
gpu
:=
range
gpusBefore
{
totalMemoryBefore
+=
gpu
.
TotalMemory
freeMemoryBefore
+=
gpu
.
FreeMemory
}
go
func
()
{
expiresAt
:=
start
.
Add
(
5
*
time
.
Second
)
// typical convergence is 0.5-1.5s
ticker
:=
time
.
NewTicker
(
250
*
time
.
Millisecond
)
defer
ticker
.
Stop
()
for
{
<-
ticker
.
C
if
time
.
Now
()
.
After
(
expiresAt
)
{
slog
.
Warn
(
"gpu VRAM usage didn't recover within timeout"
,
"seconds"
,
time
.
Since
(
start
)
.
Seconds
())
finished
<-
struct
{}{}
}
// Query GPUs, look for free to go back up
gpusNow
:=
gpu
.
GetGPUInfo
()
var
totalMemoryNow
,
freeMemoryNow
uint64
for
_
,
gpu
:=
range
gpusNow
{
totalMemoryNow
+=
gpu
.
TotalMemory
freeMemoryNow
+=
gpu
.
FreeMemory
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if
float32
(
freeMemoryNow
-
freeMemoryBefore
)
>
float32
(
runner
.
estimatedVRAM
)
*
0.8
{
slog
.
Debug
(
fmt
.
Sprintf
(
"gpu VRAM free memory converged after %0.2f seconds"
,
time
.
Since
(
start
)
.
Seconds
()))
finished
<-
struct
{}{}
return
}
}
}()
return
finished
}
type
ByDuration
[]
*
runnerRef
type
ByDuration
[]
*
runnerRef
func
(
a
ByDuration
)
Len
()
int
{
return
len
(
a
)
}
func
(
a
ByDuration
)
Len
()
int
{
return
len
(
a
)
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment