Unverified Commit 3258a89b authored by Daniel Hiltgen, committed by GitHub

DRY out the runner lifecycle code (#12540)

* DRY out the runner lifecycle code

Now that discovery uses the runners as well, this unifies the runner spawning code
into a single place. It also unifies the GPU discovery types with the newer ml.DeviceInfo.

* win: make incremental builds better

Place build artifacts in discrete directories so incremental builds don't have to start fresh

* Adjust sort order to consider iGPUs

* Handle CPU inference OOM scenarios

* review comments
parent 1c093e97
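
For illustration, a minimal caller-side sketch of the unified flow (the identifiers come from this diff; ctx, the nil runner list, and the logging are assumptions, not part of the change):

// Discovery now returns the shared ml types and spawns its probe runners
// through the same llm.StartRunner helper the scheduler uses.
devices := discover.GPUDevices(ctx, nil) // []ml.DeviceInfo
discover.LogDetails(devices)
sysInfo := discover.GetSystemInfo() // ml.SystemInfo: ThreadCount, TotalMemory, FreeMemory, FreeSwap
slog.Info("system", "threads", sysInfo.ThreadCount, "free", format.HumanBytes2(sysInfo.FreeMemory))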
@@ -2065,12 +2065,6 @@ power management:
cpus := linuxCPUDetails(buf) cpus := linuxCPUDetails(buf)
slog.Info("example", "scenario", k, "cpus", cpus) slog.Info("example", "scenario", k, "cpus", cpus)
si := SystemInfo{
System: CPUInfo{
CPUs: cpus,
},
}
threadCount := si.GetOptimalThreadCount()
if len(v.expCPUs) != len(cpus) { if len(v.expCPUs) != len(cpus) {
t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus) t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus)
} }
@@ -2085,10 +2079,6 @@ power management:
t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c) t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c)
} }
} }
if threadCount != v.expThreadCount {
t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount)
}
}) })
} }
} }
package discover package discover
import ( import (
"context"
"log/slog" "log/slog"
"os" "os"
"path/filepath"
"regexp" "regexp"
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
) )
@@ -18,159 +15,28 @@ import (
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
var CudaTegra string = os.Getenv("JETSON_JETPACK") var CudaTegra string = os.Getenv("JETSON_JETPACK")
func GetCPUInfo() GpuInfo { // GetSystemInfo returns the last cached state of the GPUs on the system
mem, err := GetCPUMem() func GetSystemInfo() ml.SystemInfo {
if err != nil { memInfo, err := GetCPUMem()
slog.Warn("error looking up system memory", "error", err)
}
return GpuInfo{
memInfo: mem,
DeviceID: ml.DeviceID{
Library: "cpu",
ID: "0",
},
}
}
func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList {
devs := GPUDevices(ctx, runners)
return devInfoToInfoList(devs)
}
func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList {
resp := []GpuInfo{}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir := filepath.Join(LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
for _, dev := range devs {
info := GpuInfo{
DeviceID: dev.DeviceID,
filterID: dev.FilteredID,
Name: dev.Description,
memInfo: memInfo{
TotalMemory: dev.TotalMemory,
FreeMemory: dev.FreeMemory,
},
// TODO can we avoid variant
DependencyPath: dev.LibraryPath,
DriverMajor: dev.DriverMajor,
DriverMinor: dev.DriverMinor,
ComputeMajor: dev.ComputeMajor,
ComputeMinor: dev.ComputeMinor,
}
if dev.Library == "CUDA" || dev.Library == "ROCm" {
info.MinimumMemory = 457 * format.MebiByte
}
if dev.Library == "ROCm" && rocmDir != "" {
info.DependencyPath = append(info.DependencyPath, rocmDir)
}
// TODO any special processing of Vulkan devices?
resp = append(resp, info)
}
if len(resp) == 0 {
mem, err := GetCPUMem()
if err != nil { if err != nil {
slog.Warn("error looking up system memory", "error", err) slog.Warn("error looking up system memory", "error", err)
} }
var threadCount int
resp = append(resp, GpuInfo{ cpus := GetCPUDetails()
memInfo: mem, for _, c := range cpus {
DeviceID: ml.DeviceID{ threadCount += c.CoreCount - c.EfficiencyCoreCount
Library: "cpu",
ID: "0",
},
})
} }
return resp
}
// Given the list of GPUs this instantiation is targeted for, if threadCount == 0 {
// figure out the visible devices environment variable // Fall back to Go's num CPU
// threadCount = runtime.NumCPU()
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() []string {
if len(l) == 0 {
return nil
}
res := []string{}
envVar := rocmGetVisibleDevicesEnv(l)
if envVar != "" {
res = append(res, envVar)
}
envVar = vkGetVisibleDevicesEnv(l)
if envVar != "" {
res = append(res, envVar)
}
return res
}
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "ROCm" {
continue
}
// If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number
if info.filterID != "" {
ids = append(ids, info.filterID)
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
envVar := "ROCR_VISIBLE_DEVICES="
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES="
}
// There are 3 potential env vars to use to select GPUs.
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
// HIP_VISIBLE_DEVICES supports numeric IDs only
// GPU_DEVICE_ORDINAL supports numeric IDs only
return envVar + strings.Join(ids, ",")
}
func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "Vulkan" {
continue
}
if info.filterID != "" {
ids = append(ids, info.filterID)
} else {
ids = append(ids, info.ID)
}
}
if len(ids) == 0 {
return ""
}
envVar := "GGML_VK_VISIBLE_DEVICES="
return envVar + strings.Join(ids, ",")
}
// GetSystemInfo returns the last cached state of the GPUs on the system
func GetSystemInfo() SystemInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
gpus := devInfoToInfoList(devices)
if len(gpus) == 1 && gpus[0].Library == "cpu" {
gpus = []GpuInfo{}
} }
return SystemInfo{ return ml.SystemInfo{
System: CPUInfo{ ThreadCount: threadCount,
CPUs: GetCPUDetails(), TotalMemory: memInfo.TotalMemory,
GpuInfo: GetCPUInfo(), FreeMemory: memInfo.FreeMemory,
}, FreeSwap: memInfo.FreeSwap,
GPUs: gpus,
} }
} }
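
A self-contained sketch of the thread-count rule used by the new GetSystemInfo above; the CPU field names mirror discover.CPU from this diff, everything else is illustrative:

package main

import (
	"fmt"
	"runtime"
)

// cpu mirrors the discover.CPU fields consulted by GetSystemInfo.
type cpu struct {
	CoreCount           int
	EfficiencyCoreCount int
}

// optimalThreads counts only performance cores and falls back to
// runtime.NumCPU when core detection yields nothing.
func optimalThreads(cpus []cpu) int {
	threads := 0
	for _, c := range cpus {
		threads += c.CoreCount - c.EfficiencyCoreCount
	}
	if threads == 0 {
		return runtime.NumCPU()
	}
	return threads
}

func main() {
	// A hybrid part with 12 cores, 4 of them efficiency cores, yields 8 threads.
	fmt.Println(optimalThreads([]cpu{{CoreCount: 12, EfficiencyCoreCount: 4}}))
}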
...
@@ -4,13 +4,8 @@ package discover
import ( import (
"context" "context"
"encoding/json"
"fmt"
"io" "io"
"log/slog" "log/slog"
"math/rand"
"net"
"net/http"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@@ -23,6 +18,7 @@ import (
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil" "github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
) )
@@ -36,7 +32,7 @@ var (
bootstrapped bool bootstrapped bool
) )
func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo { func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
deviceMu.Lock() deviceMu.Lock()
defer deviceMu.Unlock() defer deviceMu.Unlock()
startDiscovery := time.Now() startDiscovery := time.Now()
@@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev
slog.Error("Unknown Library:" + devices[i].Library) slog.Error("Unknown Library:" + devices[i].Library)
} }
extraEnvs := []string{ extraEnvs := map[string]string{
"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs "GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs
envVar + "=" + id, // Filter to just this one GPU envVar: id, // Filter to just this one GPU
} }
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
needsDelete[i] = true needsDelete[i] = true
@@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool {
return false return false
} }
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo { func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo {
// TODO DRY out with llm/server.go var out io.Writer
slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) if envconfig.LogLevel() == logutil.LevelTrace {
out = os.Stderr
}
start := time.Now() start := time.Now()
defer func() { defer func() {
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
}() }()
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs)
var l *net.TCPListener cmd, port, err := llm.StartRunner(
if l, err = net.ListenTCP("tcp", a); err == nil { true, // ollama engine
port = l.Addr().(*net.TCPAddr).Port "", // no model
l.Close() ollamaLibDirs,
} out,
} extraEnvs,
if port == 0 { )
slog.Debug("ResolveTCPAddr failed, using random port") if err != nil {
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range slog.Debug("failed to start runner to discovery GPUs", "error", err)
}
params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
if rocmDir != "" {
libraryPaths = append(libraryPaths, rocmDir)
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd := exec.Command(exe, params...)
cmd.Env = os.Environ()
if envconfig.LogLevel() == logutil.LevelTrace {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
pathNeeded := true
ollamaPathNeeded := true
extraDone := make([]bool, len(extraEnvs))
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") {
cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator))
ollamaPathNeeded = false
} else {
for j := range extraEnvs {
if extraDone[j] {
continue
}
extra := strings.SplitN(extraEnvs[j], "=", 2)
if cmp[0] == extra[0] {
cmd.Env[i] = extraEnvs[j]
extraDone[j] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
if ollamaPathNeeded {
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
}
for i := range extraDone {
if !extraDone[i] {
cmd.Env = append(cmd.Env, extraEnvs[i])
}
}
logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
if err := cmd.Start(); err != nil {
slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
return nil return nil
} }
go func() { go func() {
cmd.Wait() // exit status ignored cmd.Wait() // exit status ignored
}() }()
defer cmd.Process.Kill() defer cmd.Process.Kill()
devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd}) devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
if err != nil { if err != nil {
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 { if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
// Expected during bootstrapping while we filter out unsupported AMD GPUs // Expected during bootstrapping while we filter out unsupported AMD GPUs
@@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s
return devices return devices
} }
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
var moreDevices []ml.DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
package discover package discover
import ( import (
"context"
"log/slog" "log/slog"
"path/filepath" "path/filepath"
"runtime"
"strings" "strings"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
@@ -17,50 +15,6 @@ type memInfo struct {
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
} }
// Beginning of an `ollama info` command
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
ml.DeviceID
memInfo
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage
// False indicates FreeMemory can generally be trusted on this GPU
UnreliableFreeMemory bool
// GPU information
filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available
ComputeMajor int `json:"compute_major"` // Compute Capability or gfx
ComputeMinor int `json:"compute_minor"`
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
// TODO other performance capability info to help in scheduling decisions
}
func (gpu GpuInfo) RunnerName() string {
if gpu.Variant != "" {
return gpu.Library + "_" + gpu.Variant
}
return gpu.Library
}
type CPUInfo struct {
GpuInfo
CPUs []CPU
}
// CPU type represents a CPU Package occupying a socket // CPU type represents a CPU Package occupying a socket
type CPU struct { type CPU struct {
ID string `cpuinfo:"processor"` ID string `cpuinfo:"processor"`
@@ -71,32 +25,6 @@ type CPU struct {
ThreadCount int ThreadCount int
} }
type GpuInfoList []GpuInfo
func (l GpuInfoList) ByLibrary() []GpuInfoList {
resp := []GpuInfoList{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
}
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []GpuInfo{info})
}
}
return resp
}
func LogDetails(devices []ml.DeviceInfo) { func LogDetails(devices []ml.DeviceInfo) {
for _, dev := range devices { for _, dev := range devices {
var libs []string var libs []string
@@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) {
) )
} }
} }
// Sort by Free Space
type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type SystemInfo struct {
System CPUInfo `json:"system"`
GPUs []GpuInfo `json:"gpus"`
}
// Return the optimal number of threads to use for inference
func (si SystemInfo) GetOptimalThreadCount() int {
if len(si.System.CPUs) == 0 {
// Fall back to Go's num CPU
return runtime.NumCPU()
}
coreCount := 0
for _, c := range si.System.CPUs {
coreCount += c.CoreCount - c.EfficiencyCoreCount
}
return coreCount
}
// For each GPU, check if it does NOT support flash attention
func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l {
supportsFA := gpu.Library == "cpu" ||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier
gpu.Library == "ROCm" ||
gpu.Library == "Vulkan"
if !supportsFA {
return false
}
}
return true
}
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []ml.DeviceID
}
@@ -4,27 +4,28 @@ import (
"fmt" "fmt"
"log/slog" "log/slog"
"os" "os"
"slices"
"sort" "sort"
"strings" "strings"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
) )
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits // pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library) // The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned // If the model can not be fit fully within the available GPU(s) nil is returned
func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList { func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
for _, gl := range gpus.ByLibrary() { for _, gl := range ml.ByLibrary(gpus) {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...) sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc. // TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() { if !envconfig.SchedSpread() {
// Try to pack into as few GPUs as possible, starting from 1 GPU // Try to pack into as few GPUs as possible, starting from 1 GPU
@@ -63,8 +64,8 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
} }
// If multiple Libraries are detected, pick the Library which loads the most layers for the model // If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList { func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo {
byLibrary := gpus.ByLibrary() byLibrary := ml.ByLibrary(gpus)
if len(byLibrary) <= 1 { if len(byLibrary) <= 1 {
return gpus return gpus
} }
@@ -81,10 +82,10 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
} }
// This algorithm looks for a complete fit to determine if we need to unload other models // This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them // Split up the GPUs by type and try them
var estimatedVRAM uint64 var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() { for _, gpus := range ml.ByLibrary(allGpus) {
var layerCount int var layerCount int
estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel) estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel)
layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
@@ -97,14 +98,23 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM return true, estimatedVRAM
} }
} }
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
} }
return false, estimatedVRAM return false, estimatedVRAM
} }
func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool {
estimate := estimateGPULayers(nil, f, projectors, opts, numParallel)
if estimate.TotalSize > systemInfo.FreeMemory {
return false
}
slog.Info("new model will fit in available system memory for CPU inference, loading",
"model", modelPath,
"parallel", numParallel,
"required", format.HumanBytes2(estimate.TotalSize),
)
return true
}
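
A hypothetical caller sketch for verifyCPUFit, matching the "Handle CPU inference OOM scenarios" note in the commit message; the surrounding scheduler variables are assumed and not shown in this diff:

// Before loading with zero GPU layers, confirm the whole model fits in free system memory.
if !verifyCPUFit(f, modelPath, projectors, adapters, opts, systemInfo, numParallel) {
	slog.Warn("insufficient system memory for CPU inference", "model", modelPath)
	// the caller may unload other models or reject the request here
}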
type MemoryEstimate struct { type MemoryEstimate struct {
// How many layers we predict we can load // How many layers we predict we can load
Layers int Layers int
@@ -141,7 +151,7 @@ type MemoryEstimate struct {
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library // The GPUs provided must all be the same Library
func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
// Graph size for a partial offload, applies to all GPUs // Graph size for a partial offload, applies to all GPUs
var graphPartialOffload uint64 var graphPartialOffload uint64
@@ -175,10 +185,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
overhead := envconfig.GpuOverhead() overhead := envconfig.GpuOverhead()
availableList := make([]string, len(gpus)) availableList := make([]string, len(gpus))
libraries := []string{}
for i, gpu := range gpus { for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory) availableList[i] = format.HumanBytes2(gpu.FreeMemory)
if !slices.Contains(libraries, gpu.Library) {
libraries = append(libraries, gpu.Library)
}
}
if len(libraries) == 0 {
libraries = []string{"cpu"}
} }
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList)
for _, projector := range projectors { for _, projector := range projectors {
llamaEngineProjectorWeights += projectorMemoryRequirements(projector) llamaEngineProjectorWeights += projectorMemoryRequirements(projector)
@@ -196,7 +213,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) && useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) &&
(discover.GpuInfoList)(gpus).FlashAttentionSupported() && ml.FlashAttentionSupported(gpus) &&
f.SupportsFlashAttention() f.SupportsFlashAttention()
var kvct string var kvct string
@@ -231,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
// on metal there's no partial offload overhead // on metal there's no partial offload overhead
if gpus[0].Library == "Metal" { if len(gpus) > 0 && gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 { } else if len(gpus) > 1 {
// multigpu should always use the partial graph size // multigpu should always use the partial graph size
@@ -256,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gpuAllocations := make([]uint64, len(gpus)) gpuAllocations := make([]uint64, len(gpus))
type gs struct { type gs struct {
i int i int
g *discover.GpuInfo g *ml.DeviceInfo
} }
gpusWithSpace := []gs{} gpusWithSpace := []gs{}
for i := range gpus { for i := range gpus {
@@ -265,19 +282,11 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
gzo = gpuZeroOverhead gzo = gpuZeroOverhead
} }
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize {
var compute string
if gpus[i].Library == "ROCm" {
compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
} else {
compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor)
}
slog.Debug("gpu has too little memory to allocate any layers", slog.Debug("gpu has too little memory to allocate any layers",
"id", gpus[i].ID, "id", gpus[i].ID,
"library", gpus[i].Library, "library", gpus[i].Library,
"variant", gpus[i].Variant, "compute", gpus[i].Compute(),
"compute", compute,
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor), "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
"name", gpus[i].Name, "name", gpus[i].Name,
"total", format.HumanBytes2(gpus[i].TotalMemory), "total", format.HumanBytes2(gpus[i].TotalMemory),
@@ -291,7 +300,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
continue continue
} }
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full
} }
var gpuZeroID int var gpuZeroID int
@@ -397,7 +406,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
VRAMSize: 0, VRAMSize: 0,
GPUSizes: []uint64{}, GPUSizes: []uint64{},
inferenceLibrary: gpus[0].Library, inferenceLibrary: strings.Join(libraries, ","),
layersRequested: opts.NumGPU, layersRequested: opts.NumGPU,
layersModel: int(f.KV().BlockCount()) + 1, layersModel: int(f.KV().BlockCount()) + 1,
availableList: availableList, availableList: availableList,
@@ -411,7 +420,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
projectorGraph: ollamaEngineProjectorGraph, projectorGraph: ollamaEngineProjectorGraph,
} }
if gpus[0].Library == "cpu" { if len(gpus) == 0 {
return estimate return estimate
} }
if layerCount == 0 { if layerCount == 0 {
...
@@ -10,7 +10,7 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
) )
@@ -54,13 +54,7 @@ func TestEstimateGPULayers(t *testing.T) {
} }
// Simple CPU scenario // Simple CPU scenario
gpus := []discover.GpuInfo{ gpus := []ml.DeviceInfo{}
{
DeviceID: ml.DeviceID{
Library: "cpu",
},
},
}
projectors := []string{} projectors := []string{}
opts := api.DefaultOptions() opts := api.DefaultOptions()
t.Run("cpu", func(t *testing.T) { t.Run("cpu", func(t *testing.T) {
@@ -77,19 +71,17 @@ func TestEstimateGPULayers(t *testing.T) {
memoryLayerOutput := uint64(4) memoryLayerOutput := uint64(4)
// Dual CUDA scenario with asymmetry // Dual CUDA scenario with asymmetry
gpuMinimumMemory := uint64(2048) gpuMinimumMemory := uint64(457 * format.MebiByte)
gpus = []discover.GpuInfo{ gpus = []ml.DeviceInfo{
{ {
DeviceID: ml.DeviceID{ DeviceID: ml.DeviceID{
Library: "cuda", Library: "CUDA",
}, },
MinimumMemory: gpuMinimumMemory,
}, },
{ {
DeviceID: ml.DeviceID{ DeviceID: ml.DeviceID{
Library: "cuda", Library: "CUDA",
}, },
MinimumMemory: gpuMinimumMemory,
}, },
} }
// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
...
This diff is collapsed.
@@ -8,7 +8,6 @@ import (
"testing" "testing"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml" "github.com/ollama/ollama/ml"
"golang.org/x/sync/semaphore" "golang.org/x/sync/semaphore"
@@ -20,6 +19,8 @@ func TestLLMServerFitGPU(t *testing.T) {
free int free int
} }
minMemory := 457 * format.MebiByte
tests := []struct { tests := []struct {
name string name string
gpus []gpu gpus []gpu
@@ -37,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) {
}, },
{ {
name: "Full single GPU", name: "Full single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
}, },
{ {
name: "Partial single GPU", name: "Partial single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
}, },
{ {
name: "Single GPU with numGPU 1", name: "Single GPU with numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1, numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
}, },
{ {
name: "Single GPU with numGPU 0", name: "Single GPU with numGPU 0",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0, numGPU: 0,
expected: ml.GPULayersList{}, expected: ml.GPULayersList{},
}, },
{ {
name: "Single GPU with numGPU 999", name: "Single GPU with numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999, numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
}, },
{ {
name: "Multi GPU fits on one", name: "Multi GPU fits on one",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
}, },
{ {
name: "Multi GPU split", name: "Multi GPU split",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
}, },
{ {
name: "Multi GPU partial", name: "Multi GPU partial",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 1", name: "Multi GPU numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1, numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 2", name: "Multi GPU numGPU 2",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2, numGPU: 2,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 999", name: "Multi GPU numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999, numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
}, },
{ {
name: "Multi GPU different libraries", name: "Multi GPU different libraries",
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte}, layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
}, },
{ {
name: "requireFull", name: "requireFull",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1, numGPU: -1,
requireFull: true, requireFull: true,
@@ -139,12 +140,12 @@ func TestLLMServerFitGPU(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
var systemInfo discover.SystemInfo var systemInfo ml.SystemInfo
systemInfo.System.TotalMemory = format.GibiByte systemInfo.TotalMemory = format.GibiByte
systemInfo.System.FreeMemory = 512 * format.MebiByte systemInfo.FreeMemory = 512 * format.MebiByte
systemInfo.System.FreeSwap = 256 * format.MebiByte systemInfo.FreeSwap = 256 * format.MebiByte
gpus := make(discover.GpuInfoList, len(tt.gpus)) gpus := make([]ml.DeviceInfo, len(tt.gpus))
for i := range tt.gpus { for i := range tt.gpus {
gpus[i].DeviceID = tt.gpus[i].id gpus[i].DeviceID = tt.gpus[i].id
gpus[i].FreeMemory = uint64(tt.gpus[i].free) gpus[i].FreeMemory = uint64(tt.gpus[i].free)
...
@@ -3,15 +3,21 @@ package ml
import ( import (
"context" "context"
"encoding/binary" "encoding/binary"
"encoding/json"
"fmt" "fmt"
"hash/maphash" "hash/maphash"
"io"
"log/slog" "log/slog"
"net/http"
"runtime"
"slices" "slices"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
"time"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
) )
// GPULayers is a set of layers to be allocated on a single GPU // GPULayers is a set of layers to be allocated on a single GPU
@@ -282,6 +288,20 @@ type DeviceInfo struct {
LibraryPath []string LibraryPath []string
} }
type SystemInfo struct {
// ThreadCount is the optimal number of threads to use for inference
ThreadCount int `json:"threads,omitempty"`
// TotalMemory is the total amount of system memory
TotalMemory uint64 `json:"total_memory,omitempty"`
// FreeMemory is the amount of memory currently available on the system for loading models
FreeMemory uint64 `json:"free_memory,omitempty"`
// FreeSwap is the amount of system swap space reported as available
FreeSwap uint64 `json:"free_swap,omitempty"`
}
func (d DeviceInfo) Compute() string { func (d DeviceInfo) Compute() string {
// AMD gfx is encoded into the major minor in hex form // AMD gfx is encoded into the major minor in hex form
if strings.EqualFold(d.Library, "ROCm") { if strings.EqualFold(d.Library, "ROCm") {
@@ -294,6 +314,71 @@ func (d DeviceInfo) Driver() string {
return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor) return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor)
} }
// MinimumMemory reports the amount of memory that should be set aside
// on the device for overhead (e.g. VRAM consumed by context structures independent
// of model allocations)
func (d DeviceInfo) MinimumMemory() uint64 {
if d.Library == "Metal" {
return 512 * format.MebiByte
}
return 457 * format.MebiByte
}
// Sort by Free Space.
// iGPUs are reported first, thus Reverse() yields the largest discrete GPU first
type ByFreeMemory []DeviceInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool {
if a[i].Integrated && !a[j].Integrated {
return true
} else if !a[i].Integrated && a[j].Integrated {
return false
}
return a[i].FreeMemory < a[j].FreeMemory
}
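
Usage sketch for the new ordering, as pickBestFullFitByLibrary does elsewhere in this change; the device list and field values are illustrative, with the Integrated field name taken from the Less method above:

devs := []ml.DeviceInfo{
	{DeviceID: ml.DeviceID{ID: "igpu0"}, Integrated: true, FreeMemory: 8 * format.GibiByte},
	{DeviceID: ml.DeviceID{ID: "gpu0"}, FreeMemory: 4 * format.GibiByte},
	{DeviceID: ml.DeviceID{ID: "gpu1"}, FreeMemory: 12 * format.GibiByte},
}
// Reverse(ByFreeMemory) yields gpu1, gpu0, then igpu0: discrete GPUs by free
// memory descending, integrated GPUs last.
sort.Sort(sort.Reverse(ml.ByFreeMemory(devs)))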
func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
resp := [][]DeviceInfo{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []DeviceInfo{info})
}
}
return resp
}
func LibraryPaths(l []DeviceInfo) []string {
var gpuLibs []string
for _, gpu := range l {
for _, dir := range gpu.LibraryPath {
needed := true
for _, existing := range gpuLibs {
if dir == existing {
needed = false
break
}
}
if needed {
gpuLibs = append(gpuLibs, dir)
}
}
}
return gpuLibs
}
type DeviceComparison int type DeviceComparison int
const ( const (
@@ -336,3 +421,133 @@ func (a DeviceInfo) IsBetter(b DeviceInfo) bool {
sort.Sort(sort.Reverse(sort.StringSlice(cmp))) sort.Sort(sort.Reverse(sort.StringSlice(cmp)))
return cmp[0] == bLibSplit[1] return cmp[0] == bLibSplit[1]
} }
// For each GPU, check if it does NOT support flash attention
func FlashAttentionSupported(l []DeviceInfo) bool {
for _, gpu := range l {
supportsFA := gpu.Library == "cpu" ||
gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) ||
gpu.Library == "ROCm"
if !supportsFA {
return false
}
}
return true
}
// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variables
func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
if len(l) == 0 {
return nil
}
env := map[string]string{}
for _, d := range l {
d.updateVisibleDevicesEnv(env)
}
return env
}
func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
var envVar string
switch d.Library {
case "ROCm":
envVar = "ROCR_VISIBLE_DEVICES"
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
}
case "Vulkan":
envVar = "GGML_VK_VISIBLE_DEVICES"
default:
return
}
v, existing := env[envVar]
if existing {
v = v + ","
}
if d.FilteredID != "" {
v = v + d.FilteredID
} else {
v = v + d.ID
}
env[envVar] = v
}
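
Usage sketch for the consolidated device-filter environment; selectedGPUs and cmd are assumptions, while the keys (ROCR_VISIBLE_DEVICES, HIP_VISIBLE_DEVICES, GGML_VK_VISIBLE_DEVICES) come from the switch above:

env := ml.GetVisibleDevicesEnv(selectedGPUs) // nil when no GPUs were selected
for k, v := range env {
	cmd.Env = append(cmd.Env, k+"="+v)
}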
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []DeviceID
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) {
var moreDevices []DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
@@ -84,11 +84,11 @@ function buildCPU() {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0 New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
& cmake --fresh --preset CPU --install-prefix $script:DIST_DIR & cmake -B build\cpu --preset CPU --install-prefix $script:DIST_DIR
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset CPU --config Release --parallel $script:JOBS & cmake --build build\cpu --target ggml-cpu --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component CPU --strip & cmake --install build\cpu --component CPU --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
@@ -105,11 +105,11 @@ function buildCUDA11() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v11 backend libraries $cuda" write-host "Building CUDA v11 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda $env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11" & cmake -B build\cuda_v11 --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS & cmake --build build\cuda_v11 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip & cmake --install build\cuda_v11 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
@@ -124,11 +124,11 @@ function buildCUDA12() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
write-host "Building CUDA v12 backend libraries $cuda" write-host "Building CUDA v12 backend libraries $cuda"
$env:CUDAToolkit_ROOT=$cuda $env:CUDAToolkit_ROOT=$cuda
& cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12" & cmake -B build\cuda_v12 --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS & cmake --build build\cuda_v12 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip & cmake --install build\cuda_v12 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
@@ -143,11 +143,11 @@ function buildCUDA13() {
$hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }}
$env:CUDAToolkit_ROOT=$cuda $env:CUDAToolkit_ROOT=$cuda
write-host "Building CUDA v13 backend libraries $cuda" write-host "Building CUDA v13 backend libraries $cuda"
& cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13" & cmake -B build\cuda_v13 --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13"
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS & cmake --build build\cuda_v13 --target ggml-cuda --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "CUDA" --strip & cmake --install build\cuda_v13 --component "CUDA" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
@@ -165,7 +165,7 @@ function buildROCm() {
$env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe" $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
$env:HIP_PLATFORM="amd" $env:HIP_PLATFORM="amd"
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
& cmake --fresh --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" ` & cmake --fresh -B build\rocm --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" `
-DCMAKE_C_COMPILER=clang ` -DCMAKE_C_COMPILER=clang `
-DCMAKE_CXX_COMPILER=clang++ ` -DCMAKE_CXX_COMPILER=clang++ `
-DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" ` -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" `
...@@ -175,9 +175,9 @@ function buildROCm() { ...@@ -175,9 +175,9 @@ function buildROCm() {
$env:HIPCXX="" $env:HIPCXX=""
$env:HIP_PLATFORM="" $env:HIP_PLATFORM=""
$env:CMAKE_PREFIX_PATH="" $env:CMAKE_PREFIX_PATH=""
& cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS & cmake --build build\rocm --target ggml-hip --config Release --parallel $script:JOBS
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& cmake --install build --component "HIP" --strip & cmake --install build\rocm --component "HIP" --strip
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
} }
......
...@@ -9,9 +9,9 @@ import ( ...@@ -9,9 +9,9 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
) )
func TestGenerateDebugRenderOnly(t *testing.T) { func TestGenerateDebugRenderOnly(t *testing.T) {
...@@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) { ...@@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
...@@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) { ...@@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
......
...@@ -12,9 +12,9 @@ import ( ...@@ -12,9 +12,9 @@ import (
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
) )
// TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers
...@@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) { ...@@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
...@@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) { ...@@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
......
...@@ -17,9 +17,9 @@ import ( ...@@ -17,9 +17,9 @@ import (
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
) )
type mockRunner struct { type mockRunner struct {
...@@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error ...@@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
return return
} }
func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
return mock, nil return mock, nil
} }
} }
...@@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) { ...@@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
...@@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) { ...@@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
...@@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) { ...@@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock), newServerFn: newMockServer(mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock} req.successCh <- &runnerRef{llama: mock}
return false return false
......
...@@ -14,9 +14,9 @@ import ( ...@@ -14,9 +14,9 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
"github.com/ollama/ollama/ml"
) )
func getTestTools() []api.Tool { func getTestTools() []api.Tool {
...@@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { ...@@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond, waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
...@@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { ...@@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 100 * time.Millisecond, waitForRecovery: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
...@@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) { ...@@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock), newServerFn: newMockServer(&mock),
getGpuFn: getGpuFn, getGpuFn: getGpuFn,
getCpuFn: getCpuFn, getSystemInfoFn: getSystemInfoFn,
waitForRecovery: 250 * time.Millisecond, waitForRecovery: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
......
...@@ -5,12 +5,9 @@ import ( ...@@ -5,12 +5,9 @@ import (
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"reflect" "reflect"
"runtime"
"slices" "slices"
"sort" "sort"
"strconv"
"strings" "strings"
"sync" "sync"
"time" "time"
...@@ -52,12 +49,10 @@ type Scheduler struct { ...@@ -52,12 +49,10 @@ type Scheduler struct {
activeLoading llm.LlamaServer activeLoading llm.LlamaServer
loaded map[string]*runnerRef loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
getCpuFn func() discover.GpuInfo getSystemInfoFn func() ml.SystemInfo
// waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model
waitForRecovery time.Duration waitForRecovery time.Duration
} }
...@@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler { ...@@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
unloadedCh: make(chan any, maxQueue), unloadedCh: make(chan any, maxQueue),
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer, newServerFn: llm.NewLlamaServer,
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GPUDevices,
getCpuFn: discover.GetCPUInfo, getSystemInfoFn: discover.GetSystemInfo,
waitForRecovery: 5 * time.Second, waitForRecovery: 5 * time.Second,
} }
sched.loadFn = sched.load sched.loadFn = sched.load
...@@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) { ...@@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) {
} }
func (s *Scheduler) processPending(ctx context.Context) { func (s *Scheduler) processPending(ctx context.Context) {
maxRunners := envconfig.MaxRunners()
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
...@@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
s.loadedMu.Lock() s.loadedMu.Lock()
runner := s.loaded[pending.model.ModelPath] runner := s.loaded[pending.model.ModelPath]
loadedCount := len(s.loaded) loadedCount := len(s.loaded)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded)) runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded { for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r) runnersSnapshot = append(runnersSnapshot, r)
} }
...@@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) {
pending.useLoadedRunner(runner, s.finishedReqCh) pending.useLoadedRunner(runner, s.finishedReqCh)
break break
} }
} else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) { } else if maxRunners > 0 && loadedCount >= int(maxRunners) {
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload() runnerToExpire = s.findRunnerToUnload()
} else { } else {
// Either no models are loaded or below envconfig.MaxRunners // Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list // Get a refreshed GPU list
var gpus discover.GpuInfoList var gpus []ml.DeviceInfo
if pending.opts.NumGPU == 0 { if pending.opts.NumGPU == 0 {
gpus = discover.GpuInfoList{s.getCpuFn()} gpus = []ml.DeviceInfo{}
} else { } else {
gpus = s.getGpuFn(ctx, runnersSnapshot) gpus = s.getGpuFn(ctx, runnersSnapshot)
} }
systemInfo := s.getSystemInfoFn()
if envconfig.MaxRunners() <= 0 { if maxRunners <= 0 {
// No user specified MaxRunners, so figure out what automatic setting to use // No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt
// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs if pending.opts.NumGPU == 0 {
// if any GPU has unreliable free memory reporting, 1x the number of GPUs // Need to get actual GPU list to set the correct default max models
allReliable := true g := s.getGpuFn(ctx, runnersSnapshot)
for _, gpu := range gpus { maxRunners = uint(defaultModelsPerGPU * max(len(g), 1))
if gpu.UnreliableFreeMemory {
allReliable = false
break
}
}
if allReliable {
// HACK
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus)))
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus))
} else { } else {
// HACK maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1))
os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus)))
slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
} }
slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus))
} }
// Load model for fitting // Load model for fitting
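Editor's note: the hunk above replaces the old environment-variable rewrite (os.Setenv of OLLAMA_MAX_LOADED_MODELS) with a local maxRunners value computed once per load attempt: defaultModelsPerGPU times the number of detected GPUs, never less than one GPU's worth, and for NumGPU == 0 requests the real GPU list is still queried so the default is not collapsed to a CPU-only count. A minimal sketch of that computation follows; the constant's value and the helper name are illustrative assumptions, not the exact source.

```go
package main

import "fmt"

// defaultMaxRunners sketches the automatic concurrency default used when
// OLLAMA_MAX_LOADED_MODELS is unset: defaultModelsPerGPU multiplied by the
// detected GPU count, with a floor of one GPU. The constant value here is an
// assumption for illustration only.
func defaultMaxRunners(gpuCount int) uint {
	const defaultModelsPerGPU = 3 // assumed value
	return uint(defaultModelsPerGPU * max(gpuCount, 1))
}

func main() {
	fmt.Println(defaultMaxRunners(0)) // CPU-only host -> 3
	fmt.Println(defaultMaxRunners(2)) // two GPUs      -> 6
}
```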
...@@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
if loadedCount == 0 { if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit. // No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath) slog.Debug("loading first model", "model", pending.model.ModelPath)
s.loadFn(pending, ggml, gpus, false) s.loadFn(pending, ggml, systemInfo, gpus, false)
break break
} }
// More than one loaded model, so we have to see if the // More than one loaded model, so we have to see if the
// new one fits // new one fits
needEvict := s.loadFn(pending, ggml, gpus, true) needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true)
if !needEvict { if !needEvict {
slog.Debug("new model fits with existing models, loading") slog.Debug("new model fits with existing models, loading")
break break
...@@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) { ...@@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
runner.refMu.Unlock() runner.refMu.Unlock()
} else { } else {
slog.Debug("starting background wait for VRAM recovery", "runner", runner) slog.Debug("starting background wait for VRAM recovery", "runner", runner)
runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded)) runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded))
for _, r := range s.loaded { for _, r := range s.loaded {
runnersSnapshot = append(runnersSnapshot, r) runnersSnapshot = append(runnersSnapshot, r)
} }
...@@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm ...@@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
// load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit. // (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool { func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool {
numParallel := max(int(envconfig.NumParallel()), 1) numParallel := max(int(envconfig.NumParallel()), 1)
// Embedding models should always be loaded with parallel=1 // Embedding models should always be loaded with parallel=1
...@@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
if llama == nil { if llama == nil {
var err error var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil { if err != nil {
// some older models are not compatible with newer versions of llama.cpp // some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to // show a generalized compatibility error until there is a better way to
...@@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
s.loadedMu.Unlock() s.loadedMu.Unlock()
gpuIDs, err := llama.Load(req.ctx, gpus, requireFull) gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull)
if err != nil { if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) { if errors.Is(err, llm.ErrLoadRequiredFull) {
if !requireFull {
// No other models loaded, yet we still don't fit, so report an error
slog.Info("model is too large for system memory", "requireFull", requireFull)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
}
return true return true
} }
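Editor's note: this new branch covers the "cpu inference oom" scenario from the commit message. When the load fails with llm.ErrLoadRequiredFull but requireFull is false, there are no other runners to evict, so the scheduler tears down the in-flight load and reports the failure to the waiting request instead of retrying. A rough sketch of that decision, with simplified error handling and hypothetical names standing in for the scheduler's fields:

```go
package main

import (
	"errors"
	"fmt"
)

// errLoadRequiredFull stands in for llm.ErrLoadRequiredFull.
var errLoadRequiredFull = errors.New("model does not fit without a full load")

// onLoadError sketches the branch above: it returns true whenever the
// "required full" error is seen (mirroring the diff), and if requireFull was
// false there is nothing left to evict, so the error is also surfaced to the
// waiting request. Other errors are handled in a simplified way here.
func onLoadError(loadErr error, requireFull bool, errCh chan error) (needEvict bool) {
	if errors.Is(loadErr, errLoadRequiredFull) {
		if !requireFull {
			errCh <- loadErr // model is too large for this system
		}
		return true
	}
	errCh <- loadErr
	return false
}

func main() {
	errCh := make(chan error, 1)
	fmt.Println(onLoadError(errLoadRequiredFull, false, errCh)) // true
	fmt.Println(<-errCh)
}
```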
...@@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false return false
} }
// Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown
discreteGPUs := false
iGPUScan:
for _, devid := range gpuIDs {
for _, dev := range gpus {
if dev.DeviceID == devid {
if !dev.Integrated {
discreteGPUs = true
break iGPUScan
}
}
}
}
runner := &runnerRef{ runner := &runnerRef{
model: req.model, model: req.model,
modelPath: req.model.ModelPath, modelPath: req.model.ModelPath,
...@@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts, Options: &req.opts,
sessionDuration: sessionDuration, sessionDuration: sessionDuration,
gpus: gpuIDs, gpus: gpuIDs,
discreteGPUs: discreteGPUs,
vramSize: llama.VRAMSize(), vramSize: llama.VRAMSize(),
totalSize: llama.TotalSize(), totalSize: llama.TotalSize(),
loading: true, loading: true,
...@@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
return false return false
} }
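Editor's note: the iGPUScan loop added to load() above records whether any of the devices the model was actually placed on is a discrete GPU; runnerRef.discreteGPUs then lets waitForVRAMRecovery skip the wait entirely for CPU-, Metal-, or iGPU-only runners. A standalone sketch of that scan, assuming a trimmed-down device type with only the fields the check needs:

```go
package main

import "fmt"

// deviceInfo is a trimmed stand-in for ml.DeviceInfo: only the identity and
// the Integrated flag matter for this check.
type deviceInfo struct {
	ID         string
	Integrated bool
}

// hasDiscreteGPU mirrors the labeled-break scan above by returning early: walk
// the allocated device IDs and report true as soon as one maps to a
// non-integrated GPU.
func hasDiscreteGPU(allocated []string, devices []deviceInfo) bool {
	for _, id := range allocated {
		for _, dev := range devices {
			if dev.ID == id && !dev.Integrated {
				return true
			}
		}
	}
	return false
}

func main() {
	devs := []deviceInfo{{ID: "igpu0", Integrated: true}, {ID: "gpu1", Integrated: false}}
	fmt.Println(hasDiscreteGPU([]string{"igpu0"}, devs))         // false: iGPU only
	fmt.Println(hasDiscreteGPU([]string{"igpu0", "gpu1"}, devs)) // true
}
```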
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) {
if len(allGpus) == 0 {
return
}
predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners
s.loadedMu.Lock() s.loadedMu.Lock()
runners := make([]*runnerRef, 0, len(s.loaded)) runners := make([]*runnerRef, 0, len(s.loaded))
...@@ -558,6 +570,7 @@ type runnerRef struct { ...@@ -558,6 +570,7 @@ type runnerRef struct {
pid int pid int
loading bool // True only during initial load, then false forever loading bool // True only during initial load, then false forever
gpus []ml.DeviceID // Recorded at time of provisioning gpus []ml.DeviceID // Recorded at time of provisioning
discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs
vramSize uint64 vramSize uint64
totalSize uint64 totalSize uint64
...@@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool ...@@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
// a before and after GPU memory allocation. The returned channel // a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should // will be notified when we're done waiting, or have timed out and should
// proceed anyway // proceed anyway
func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any { func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any {
finished := make(chan any, 1) finished := make(chan any, 1)
// CPU or Metal don't need checking, so no waiting required // CPU, Metal and iGPUs don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage if len(runner.gpus) == 0 || !runner.discreteGPUs ||
if len(runner.gpus) == 0 || (len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") {
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") {
finished <- struct{}{} finished <- struct{}{}
slog.Debug("no need to wait for VRAM recovery", "runner", runner) slog.Debug("no need to wait for VRAM recovery", "runner", runner)
return finished return finished
...@@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi ...@@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi
totalMemoryNow += gpu.TotalMemory totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory freeMemoryNow += gpu.FreeMemory
} }
logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100)) if freeMemoryNow > freeMemoryBefore {
logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100))
} else {
logutil.Trace("gpu VRAM convergence", "percent", 0)
}
// If we're within ~75% of the estimated memory usage recovered, bail out // If we're within ~75% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 { if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner) slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner)
......
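Editor's note: the convergence-logging change above guards the subtraction before converting to float. freeMemoryNow and freeMemoryBefore are unsigned, so if free memory has not yet recovered the old expression wrapped around to a huge value before max(…, 0.0) ever saw it. A small sketch of the hazard and the guarded form, under the assumption that both counters are uint64 byte counts:

```go
package main

import "fmt"

// recoveryPercent reports how much of the runner's VRAM estimate has been
// freed again, clamping to 0 when free memory has not increased yet. Doing the
// comparison before the unsigned subtraction avoids the wraparound the old
// max(float32(now-before), 0) pattern could not catch.
func recoveryPercent(freeBefore, freeNow, vramSize uint64) int {
	if freeNow <= freeBefore || vramSize == 0 {
		return 0
	}
	return int(float32(freeNow-freeBefore) / float32(vramSize) * 100)
}

func main() {
	fmt.Println(recoveryPercent(10_000, 9_000, 4_000)) // 0, not a huge wrapped value
	fmt.Println(recoveryPercent(9_000, 12_000, 4_000)) // 75
}
```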
...@@ -13,7 +13,6 @@ import ( ...@@ -13,7 +13,6 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/app/lifecycle" "github.com/ollama/ollama/app/lifecycle"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
...@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) { ...@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2 * time.Second}, sessionDuration: &api.Duration{Duration: 2 * time.Second},
} }
// Fail to load model first // Fail to load model first
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
return nil, errors.New("something failed to load model blah") return nil, errors.New("something failed to load model blah")
} }
gpus := discover.GpuInfoList{} gpus := []ml.DeviceInfo{}
s.load(req, f, gpus, false) systemInfo := ml.SystemInfo{}
s.load(req, f, systemInfo, gpus, false)
require.Empty(t, req.successCh) require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1) require.Len(t, req.errCh, 1)
s.loadedMu.Lock() s.loadedMu.Lock()
...@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) { ...@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
require.Contains(t, err.Error(), "this model may be incompatible") require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model server.modelPath = model
return server, nil return server, nil
} }
s.load(req, f, gpus, false) s.load(req, f, systemInfo, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
require.NoError(t, err) require.NoError(t, err)
...@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) { ...@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path" req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure") server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, false) s.load(req, f, systemInfo, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure") require.Contains(t, err.Error(), "wait failure")
...@@ -106,7 +106,7 @@ type reqBundle struct { ...@@ -106,7 +106,7 @@ type reqBundle struct {
f *ggml.GGML f *ggml.GGML
} }
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model scenario.srv.modelPath = model
return scenario.srv, nil return scenario.srv, nil
} }
...@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra ...@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
return b return b
} }
func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("test getGpuFn called", "runners", runners) slog.Info("test getGpuFn called", "runners", runners)
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g} return []ml.DeviceInfo{g}
} }
func getCpuFn() discover.GpuInfo { func getSystemInfoFn() ml.SystemInfo {
slog.Info("test getCpuFn called") slog.Info("test getSystemInfoFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}} return ml.SystemInfo{
g.TotalMemory = 32 * format.GigaByte TotalMemory: 32 * format.GigaByte,
g.FreeMemory = 26 * format.GigaByte FreeMemory: 26 * format.GigaByte,
return g }
} }
func TestSchedRequestsSameModelSameRequest(t *testing.T) { func TestSchedRequestsSameModelSameRequest(t *testing.T) {
...@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) { ...@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil) a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil) b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
b.req.model = a.req.model b.req.model = a.req.model
...@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { ...@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn s.getSystemInfoFn = getSystemInfoFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil) a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil) b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
tmpModel := *a.req.model tmpModel := *a.req.model
...@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { ...@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
a.ctxDone() a.ctxDone()
// Report recovered VRAM usage // Report recovered VRAM usage
time.Sleep(1 * time.Millisecond) time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
slog.Info("XXX altered getGpuFn called") slog.Info("altered getGpuFn called")
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g} return []ml.DeviceInfo{g}
} }
select { select {
case resp := <-b.req.successCh: case resp := <-b.req.successCh:
...@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { ...@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
} }
func TestSchedRequestsMultipleLoadedModels(t *testing.T) { func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) slog.Info("TestRequestsMultipleLoadedModels")
ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done() defer done()
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn // 1 metal GPU s.getGpuFn = getGpuFn // 1 Metal GPU
s.getCpuFn = getCpuFn // 1 CPU s.getSystemInfoFn = getSystemInfoFn
// Multiple loaded models // Multiple loaded models
a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte}) a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte}) b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */) c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
c.req.opts.NumGPU = 0 // CPU load, will be allowed c.req.opts.NumGPU = 0 // CPU load, will be allowed
b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c
d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer s.newServerFn = a.newServer
slog.Info("a") slog.Info("Loading A")
s.pendingReqCh <- a.req s.pendingReqCh <- a.req
s.Run(ctx) s.Run(ctx)
select { select {
...@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { ...@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0") t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
s.newServerFn = b.newServer s.newServerFn = b.newServer
slog.Info("b") slog.Info("Loading B")
s.pendingReqCh <- b.req s.pendingReqCh <- b.req
select { select {
case resp := <-b.req.successCh: case resp := <-b.req.successCh:
...@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { ...@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
// This is a CPU load with NumGPU = 0 so it should load // This is a CPU load with NumGPU = 0 so it should load
s.newServerFn = c.newServer s.newServerFn = c.newServer
slog.Info("c") slog.Info("Loading C")
s.pendingReqCh <- c.req s.pendingReqCh <- c.req
select { select {
case resp := <-c.req.successCh: case resp := <-c.req.successCh:
...@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { ...@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
case err := <-c.req.errCh: case err := <-c.req.errCh:
t.Fatal(err.Error()) t.Fatal(err.Error())
case <-ctx.Done(): case <-ctx.Done():
slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
t.Fatal("timeout") t.Fatal("timeout")
} }
s.loadedMu.Lock() s.loadedMu.Lock()
...@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { ...@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
b.ctxDone() b.ctxDone()
// Report recovered VRAM usage so scheduler will finish waiting and unload // Report recovered VRAM usage so scheduler will finish waiting and unload
time.Sleep(1 * time.Millisecond) time.Sleep(1 * time.Millisecond)
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
g.TotalMemory = 24 * format.GigaByte g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 24 * format.GigaByte g.FreeMemory = 24 * format.GigaByte
return []discover.GpuInfo{g} return []ml.DeviceInfo{g}
} }
select { select {
case resp := <-d.req.successCh: case resp := <-d.req.successCh:
...@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) { ...@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn s.getGpuFn = getGpuFn
s.getCpuFn = getCpuFn s.getSystemInfoFn = getSystemInfoFn
s.newServerFn = a.newServer s.newServerFn = a.newServer
slog.Info("a") slog.Info("a")
successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration) successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
...@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) { ...@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
} }
var f *ggml.GGML var f *ggml.GGML
gpus := discover.GpuInfoList{} gpus := []ml.DeviceInfo{}
systemInfo := ml.SystemInfo{}
server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model server.modelPath = model
return server, nil return server, nil
} }
s.load(req, f, gpus, false) s.load(req, f, systemInfo, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
...@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) { ...@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {
// TODO - add one scenario that triggers the bogus finished event with positive ref count // TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestSchedPrematureExpired(t *testing.T) { func TestSchedPrematureExpired(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
defer done() defer done()
// Same model, same request // Same model, same request
scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil) scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { s.getGpuFn = getGpuFn
g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} s.getSystemInfoFn = getSystemInfoFn
g.TotalMemory = 24 * format.GigaByte
g.FreeMemory = 12 * format.GigaByte
return []discover.GpuInfo{g}
}
s.newServerFn = scenario1a.newServer s.newServerFn = scenario1a.newServer
successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration) successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
require.Len(t, s.pendingReqCh, 1) require.Len(t, s.pendingReqCh, 1)
...@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) { ...@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
func TestSchedUpdateFreeSpace(t *testing.T) { func TestSchedUpdateFreeSpace(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done() defer done()
gpus := discover.GpuInfoList{ gpus := []ml.DeviceInfo{
{ {
DeviceID: ml.DeviceID{ DeviceID: ml.DeviceID{
ID: "1", ID: "1",
...@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string { ...@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
return s.modelPath return s.modelPath
} }
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) { func (s *mockLlm) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
if requireFull { if requireFull {
if len(gpus) == 0 {
slog.Info("mockLlm.Load CPU based load")
return nil, nil
}
for _, g := range gpus { for _, g := range gpus {
if g.FreeMemory >= s.vramSize { if g.FreeMemory >= s.vramSize {
return []ml.DeviceID{g.DeviceID}, nil return []ml.DeviceID{g.DeviceID}, nil
......