Commit ff4f0cbd authored by Daniel Hiltgen

Prevent multiple concurrent loads on the same gpus

While models are loading, the VRAM metrics are dynamic, so try
to load on a GPU that doesn't have a model actively loading, or wait,
to avoid races that lead to OOMs.
parent fc37c192
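
In outline, the scheduler now treats GPUs that still have a model loading as unavailable, and prefers a short delayed requeue over evicting a runner while VRAM numbers are still settling. Below is a condensed sketch of the new decision path, simplified from the processPending hunk further down (control flow, locking, and error handling omitted; not a drop-in replacement):

availGpus := s.filterGPUsWithLoadingModels(gpus) // drop GPUs with an in-flight load
s.updateFreeSpace(availGpus)                     // refresh free VRAM from loaded runners

if fitGpus := pickBestFitGPUs(pending, ggml, availGpus); fitGpus != nil {
	// No other load is racing on these GPUs, so it is safe to start ours.
	s.loadFn(pending, ggml, fitGpus)
} else if len(availGpus) < len(gpus) {
	// Some GPUs were filtered out because models are still loading on them;
	// requeue after a short delay instead of evicting based on unstable numbers.
	go func() {
		time.Sleep(s.reschedDelay)
		s.pendingReqCh <- pending
	}()
} else {
	// Nothing is loading anywhere, so the model genuinely doesn't fit:
	// pick an existing runner to unload.
	runnerToExpire = s.findRunnerToUnload()
}
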
@@ -26,6 +26,7 @@ type LlmRequest struct {
sessionDuration time.Duration
successCh chan *runnerRef
errCh chan error
schedAttempts uint
}
type Scheduler struct {
@@ -37,10 +38,11 @@ type Scheduler struct {
loaded map[string]*runnerRef
loadedMu sync.Mutex
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList
getCpuFn func() gpu.GpuInfoList
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList
getCpuFn func() gpu.GpuInfoList
reschedDelay time.Duration
}
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -55,6 +57,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo,
getCpuFn: gpu.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
}
sched.loadFn = sched.load
return sched
@@ -106,6 +109,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
return
case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running
pending.schedAttempts++
if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling")
@@ -172,13 +176,39 @@ func (s *Scheduler) processPending(ctx context.Context) {
}
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the new one fits
// More than one loaded model, so we have to see if the
// new one fits
//
// We want to avoid loading on any GPUs that have other
// models still loading on them to avoid potential races
// with VRAM consumption ramping up during load
availGpus := s.filterGPUsWithLoadingModels(gpus)
// Update free memory from currently loaded models
s.updateFreeSpace(gpus)
gpus = pickBestFitGPUs(pending, ggml, gpus)
if gpus != nil {
s.updateFreeSpace(availGpus)
fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, gpus)
s.loadFn(pending, ggml, fitGpus)
break
}
// We couldn't find a set of GPUs to fully load the new
// model. If no other models are loading (both GPU lists
// are the same) then we need to unload another model to
// make room
if len(availGpus) < len(gpus) {
// There are other requests pending, and this one
// needs more time, so put it on the back of the
// queue so that we might satisfy other pending
// requests that aren't blocked
go func() {
// Process in a goroutine to avoid deadlocking
// the scheduler if our queue is full
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
}
runnerToExpire = s.findRunnerToUnload()
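
The requeue above happens in a goroutine because the scheduler loop is the only reader of pendingReqCh: a synchronous send back onto a full queue from inside that loop could never complete. A standalone toy illustration of the pattern, using plain channels rather than the scheduler's types:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Toy stand-in for s.pendingReqCh: a small buffer whose only reader is
	// the loop below. If the loop itself ran `pending <- 2` while the buffer
	// was full, it would block forever, since it cannot drain and send at
	// the same time. Sending from a separate goroutine avoids that.
	pending := make(chan int, 1)
	pending <- 1 // queue is now full

	go func() {
		time.Sleep(50 * time.Millisecond) // stand-in for s.reschedDelay
		pending <- 2                      // parks until the loop drains item 1
	}()

	for i := 0; i < 2; i++ {
		fmt.Println("scheduling request", <-pending)
	}
}
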
@@ -409,11 +439,36 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
// after we start our first runner, then we'll never account for that, so picking the smallest free value seems prudent.
allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
}
slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
slog.Info("updated VRAM based on existing loaded models", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
}
}
}
// While models are loading, the VRAM consumption numbers will be indeterminate, so we have
// to avoid scheduling another model on the same GPU(s) until they have stabilized.
// This routine returns the set of GPUs that do not have an active loading model.
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
ret := append(gpu.GpuInfoList{}, allGpus...)
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
for _, runner := range s.loaded {
if runner.loading {
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
for _, busyGPU := range runner.gpus {
for i := range ret {
if ret[i].ID == busyGPU.ID {
ret = append(ret[:i], ret[i+1:]...)
break
}
}
}
}
}
return ret
}
// TODO consolidate sched_types.go
type runnerRef struct {
refMu sync.Mutex
// refCond sync.Cond // Signaled on transition from 1 -> 0 refCount
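
For illustration, a self-contained sketch of the filtering behaviour of filterGPUsWithLoadingModels above, using simplified stand-in types rather than the real gpu package:

package main

import "fmt"

type gpuInfo struct{ ID string }

// filterBusy mirrors the idea above: GPUs that still have a model loading on
// them are dropped from the candidate list.
func filterBusy(all []gpuInfo, loading map[string]bool) []gpuInfo {
	ret := make([]gpuInfo, 0, len(all))
	for _, g := range all {
		if !loading[g.ID] {
			ret = append(ret, g)
		}
	}
	return ret
}

func main() {
	all := []gpuInfo{{ID: "GPU-0"}, {ID: "GPU-1"}, {ID: "GPU-2"}}
	loading := map[string]bool{"GPU-1": true} // a model is still loading on GPU-1
	fmt.Println(filterBusy(all, loading))     // [{GPU-0} {GPU-2}]
	// If models were loading on every GPU the result would be empty, and the
	// scheduler would requeue the request after reschedDelay rather than race.
}
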
@@ -519,7 +574,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
for {
<-ticker.C
if time.Now().After(expiresAt) {
slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds(), "model", runner.modelPath)
finished <- struct{}{}
}
@@ -532,7 +587,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "model", runner.modelPath)
finished <- struct{}{}
return
}
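
The convergence check in waitForVRAMRecovery compares how much free VRAM has reappeared since the wait began against ~80% of what the runner was estimated to occupy. A small sketch with made-up numbers (not taken from the commit):

package main

import "fmt"

func main() {
	const GiB = uint64(1) << 30
	estimatedVRAM := 5 * GiB                // what the runner was believed to use
	freeMemoryBefore := 2 * GiB             // free VRAM sampled when the wait began
	freeMemoryNow := 6*GiB + 512*(GiB/1024) // free VRAM at this tick: 6.5 GiB

	recovered := freeMemoryNow - freeMemoryBefore        // 4.5 GiB freed so far
	if float32(recovered) > float32(estimatedVRAM)*0.8 { // 4.5 GiB > 4.0 GiB
		fmt.Println("VRAM considered recovered; the next load can proceed")
	} else {
		fmt.Println("keep polling until the timeout, then log the warning above")
	}
}
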