Unverified Commit bc8909fb authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama
runner.  This should eliminate inconsistency between our GPU discovery and the
runners capabilities at runtime, particularly for cases where we try to filter
out unsupported GPUs.  Now the runner does that implicitly based on the actual
device list.  In some cases free VRAM reporting can be unreliable which can
leaad to scheduling mistakes, so this also includes a patch to leverage more
reliable VRAM reporting libraries if available.

Automatic workarounds have been removed as only one GPU leveraged this, which
is now documented. This GPU will soon fall off the support matrix with the next
ROCm bump.

Additional cleanup of the scheduler and discovery packages can be done in the
future once we have switched on the new memory management code, and removed
support for the llama runner.
parent 6b50f2b9
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#include "gpu_info_oneapi.h"
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
{"zesInit", (void *)&resp->oh.zesInit},
{"zesDriverGet", (void *)&resp->oh.zesDriverGet},
{"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
{"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
{"zesDeviceEnumMemoryModules",
(void *)&resp->oh.zesDeviceEnumMemoryModules},
{"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
{"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
{NULL, NULL},
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
oneapi_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose,
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->oh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
if (driver > h.num_drivers || device > h.num_devices[driver]) {
resp->err = strdup("driver of device index out of bounds");
return;
}
resp->total = 0;
resp->free = 0;
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver > h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_ONEAPI_H__
#define __GPU_INFO_ONEAPI_H__
#include "gpu_info.h"
#define ZE_MAX_DEVICE_NAME 256
#define ZE_MAX_DEVICE_UUID_SIZE 16
#define ZES_STRING_PROPERTY_SIZE 64
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
typedef uint8_t ze_bool_t;
typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
ZE_DEVICE_TYPE_MCA = 4,
ZE_DEVICE_TYPE_VPU = 5,
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
ZES_DEVICE_TYPE_MCA = 4,
ZES_DEVICE_TYPE_VPU = 5,
ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
uint32_t vendorId;
uint32_t deviceId;
ze_device_property_flags_t flags;
uint32_t subdeviceId;
uint32_t coreClockRate;
uint64_t maxMemAllocSize;
uint32_t maxHardwareContexts;
uint32_t maxCommandQueuePriority;
uint32_t numThreadsPerEU;
uint32_t physicalEUSimdWidth;
uint32_t numEUsPerSubslice;
uint32_t numSubslicesPerSlice;
uint32_t numSlices;
uint64_t timerResolution;
uint32_t timestampValidBits;
uint32_t kernelTimestampValidBits;
ze_device_uuid_t uuid;
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
uint32_t numSubdevices;
char serialNumber[ZES_STRING_PROPERTY_SIZE];
char boardNumber[ZES_STRING_PROPERTY_SIZE];
char brandName[ZES_STRING_PROPERTY_SIZE];
char modelName[ZES_STRING_PROPERTY_SIZE];
char vendorName[ZES_STRING_PROPERTY_SIZE];
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
zes_device_type_t type;
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
ze_bool_t onSubdevice;
uint32_t subdeviceId;
zes_mem_loc_t location;
uint64_t physicalSize;
int32_t busWidth;
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
uint64_t free;
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
zes_device_handle_t *phDevices);
ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
zes_device_properties_t *pProperties);
ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
uint32_t *pCount,
zes_mem_handle_t *phMemory);
ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
zes_mem_properties_t *pProperties);
ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
zes_mem_state_t *pState);
} oneapi_handle_t;
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_INTEL_H__
#endif // __APPLE__
package discover
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
assert.Greater(t, info[0].FreeMemory, uint64(0))
}
}
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
func TestByLibrary(t *testing.T) {
type testCase struct {
input []GpuInfo
expect int
}
testCases := map[string]*testCase{
"empty": {input: []GpuInfo{}, expect: 0},
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
}
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
resp := (GpuInfoList)(v.input).ByLibrary()
if len(resp) != v.expect {
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
}
})
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package discover
// Runner based GPU discovery
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
)
var (
deviceMu sync.Mutex
devices []ml.DeviceInfo
libDirs map[string]struct{}
rocmDir string
exe string
bootstrapped bool
)
func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
startDiscovery := time.Now()
msg := "overall device VRAM discovery took"
defer func() {
slog.Debug(msg, "duration", time.Since(startDiscovery))
}()
if !bootstrapped {
msg = "GPU bootstrap discovery took"
libDirs = make(map[string]struct{})
var err error
exe, err = os.Executable()
if err != nil {
slog.Error("unable to lookup executable path", "error", err)
return nil
}
if eval, err := filepath.EvalSymlinks(exe); err == nil {
exe = eval
}
files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
if err != nil {
slog.Debug("unable to lookup runner library directories", "error", err)
}
for _, file := range files {
libDirs[filepath.Dir(file)] = struct{}{}
}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir = filepath.Join(LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
slog.Info("discovering available GPUs...")
// For our initial discovery pass, we gather all the known GPUs through
// all the libraries that were detected. This pass may include GPUs that
// are enumerated, but not actually supported.
// We run this in serial to avoid potentially initializing a GPU multiple
// times concurrently leading to memory contention
for dir := range libDirs {
var dirs []string
if dir == "" {
dirs = []string{LibOllamaPath}
} else {
dirs = []string{LibOllamaPath, dir}
}
// Typically bootstrapping takes < 1s, but on some systems, with devices
// in low power/idle mode, initialization can take multiple seconds. We
// set a long timeout just for bootstrap discovery to reduce the chance
// of giving up too quickly
ctx1stPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
// For this pass, we retain duplicates in case any are incompatible with some libraries
devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
}
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
var wg sync.WaitGroup
needsDelete := make([]bool, len(devices))
supportedMu := sync.Mutex{}
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
for i := range devices {
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
if devices[i].Library == "Metal" {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
var envVar string
if devices[i].Library == "ROCm" {
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
} else {
envVar = "ROCR_VISIBLE_DEVICES"
}
} else {
envVar = "CUDA_VISIBLE_DEVICES"
}
extraEnvs := []string{
"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
envVar + "=" + devices[i].ID, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
needsDelete[i] = true
} else {
supportedMu.Lock()
if _, ok := supported[devices[i].Library]; !ok {
supported[devices[i].Library] = make(map[string]map[string]int)
}
if _, ok := supported[devices[i].Library][libDir]; !ok {
supported[devices[i].Library][libDir] = make(map[string]int)
}
supported[devices[i].Library][libDir][devices[i].ID] = i
supportedMu.Unlock()
}
}(i)
}
wg.Wait()
logutil.Trace("supported GPU library combinations", "supported", supported)
// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
filterOverlapByLibrary(supported, needsDelete)
// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
rocmID := 0
for i := 0; i < len(needsDelete); i++ {
if needsDelete[i] {
logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
devices = append(devices[:i], devices[i+1:]...)
needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
i--
} else if devices[i].Library == "ROCm" {
if _, err := strconv.Atoi(devices[i].ID); err == nil {
// Replace the numeric ID with the post-filtered IDs
devices[i].FilteredID = devices[i].ID
devices[i].ID = strconv.Itoa(rocmID)
}
rocmID++
}
}
// Now filter out any overlap with different libraries (favor CUDA/ROCm over others)
for i := 0; i < len(devices); i++ {
for j := i + 1; j < len(devices); j++ {
// For this pass, we only drop exact duplicates
switch devices[i].Compare(devices[j]) {
case ml.SameBackendDevice:
// Same library and device, skip it
devices = append(devices[:j], devices[j+1:]...)
j--
continue
case ml.DuplicateDevice:
// Different library, choose based on priority
var droppedDevice ml.DeviceInfo
if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
droppedDevice = devices[j]
} else {
droppedDevice = devices[i]
devices[i] = devices[j]
}
devices = append(devices[:j], devices[j+1:]...)
j--
typeStr := "discrete"
if droppedDevice.Integrated {
typeStr = "iGPU"
}
slog.Debug("dropping duplicate device",
"id", droppedDevice.ID,
"library", droppedDevice.Library,
"compute", droppedDevice.Compute(),
"name", droppedDevice.Name,
"description", droppedDevice.Description,
"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
"driver", droppedDevice.Driver(),
"pci_id", droppedDevice.PCIID,
"type", typeStr,
"total", format.HumanBytes2(droppedDevice.TotalMemory),
"available", format.HumanBytes2(droppedDevice.FreeMemory),
)
continue
}
}
}
// Reset the libDirs to what we actually wind up using for future refreshes
libDirs = make(map[string]struct{})
for _, dev := range devices {
dir := dev.LibraryPath[len(dev.LibraryPath)-1]
if dir != LibOllamaPath {
libDirs[dir] = struct{}{}
}
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
bootstrapped = true
} else {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
// metal never updates free VRAM
return devices
}
slog.Debug("refreshing free memory")
updated := make([]bool, len(devices))
allDone := func() bool {
allDone := true
for _, done := range updated {
if !done {
allDone = false
break
}
}
return allDone
}
// First try to use existing runners to refresh VRAM since they're already
// active on GPU(s)
for _, runner := range runners {
if runner == nil {
continue
}
deviceIDs := runner.GetActiveDeviceIDs()
if len(deviceIDs) == 0 {
// Skip this runner since it doesn't have active GPU devices
continue
}
// Check to see if this runner is active on any devices that need a refresh
skip := true
devCheck:
for _, dev := range deviceIDs {
for i := range devices {
if dev == devices[i].DeviceID {
if !updated[i] {
skip = false
break devCheck
}
}
}
}
if skip {
continue
}
// Typical refresh on existing runner is ~500ms but allow longer if the system
// is under stress before giving up and using stale data.
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
start := time.Now()
updatedDevices := runner.GetDeviceInfos(ctx)
slog.Debug("existing runner discovery took", "duration", time.Since(start))
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
}
// Short circuit if we've updated all the devices
if allDone() {
break
}
}
if !allDone() {
slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
// Bootstrapping may take longer in some cases (AMD windows), but we
// would rather use stale free data to get the model running sooner
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
for dir := range libDirs {
updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil)
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
}
if allDone() {
break
}
}
if !allDone() {
slog.Warn("unable to refresh free memory, using old values")
}
}
}
return devices
}
func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
// For multi-GPU systems, use the newest version that supports all the GPUs
for _, byLibDirs := range supported {
libDirs := make([]string, 0, len(byLibDirs))
for libDir := range byLibDirs {
libDirs = append(libDirs, libDir)
}
sort.Sort(sort.Reverse(sort.StringSlice(libDirs)))
anyMissing := false
var newest string
for _, newest = range libDirs {
for _, libDir := range libDirs {
if libDir == newest {
continue
}
if len(byLibDirs[newest]) != len(byLibDirs[libDir]) {
anyMissing = true
break
}
for dev := range byLibDirs[newest] {
if _, found := byLibDirs[libDir][dev]; !found {
anyMissing = true
break
}
}
}
if !anyMissing {
break
}
}
// Now we can mark overlaps for deletion
for _, libDir := range libDirs {
if libDir == newest {
continue
}
for dev, i := range byLibDirs[libDir] {
if _, found := byLibDirs[newest][dev]; found {
needsDelete[i] = true
}
}
}
}
}
type bootstrapRunner struct {
port int
cmd *exec.Cmd
}
func (r *bootstrapRunner) GetPort() int {
return r.port
}
func (r *bootstrapRunner) HasExited() bool {
if r.cmd != nil && r.cmd.ProcessState != nil {
return true
}
return false
}
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
// TODO DRY out with llm/server.go
slog.Debug("spawing runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
start := time.Now()
defer func() {
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
}()
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
if rocmDir != "" {
libraryPaths = append(libraryPaths, rocmDir)
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd := exec.Command(exe, params...)
cmd.Env = os.Environ()
if envconfig.LogLevel() == logutil.LevelTrace {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
pathNeeded := true
extraDone := make([]bool, len(extraEnvs))
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else {
for j := range extraEnvs {
if extraDone[j] {
continue
}
extra := strings.SplitN(extraEnvs[j], "=", 2)
if cmp[0] == extra[0] {
cmd.Env[i] = extraEnvs[j]
extraDone[i] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
for i := range extraDone {
if !extraDone[i] {
cmd.Env = append(cmd.Env, extraEnvs[i])
}
}
logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
if err := cmd.Start(); err != nil {
slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
return nil
}
go func() {
cmd.Wait() // exit status ignored
}()
defer cmd.Process.Kill()
devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
if err != nil {
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
// Expected during bootstrapping while we filter out unsupported AMD GPUs
logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
} else {
slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
}
}
logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
return devices
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
var moreDevices []ml.DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
package discover
import (
"testing"
"github.com/ollama/ollama/app/lifecycle"
)
func init() {
lifecycle.InitLogging()
}
func TestFilterOverlapByLibrary(t *testing.T) {
type testcase struct {
name string
inp map[string]map[string]map[string]int
exp []bool
}
for _, tc := range []testcase{
{
name: "empty",
inp: map[string]map[string]map[string]int{},
exp: []bool{}, // needs deletion
},
{
name: "single no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
},
},
exp: []bool{false},
},
{
name: "100% overlap pick 2nd",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{true, true, false, false},
},
{
name: "100% overlap pick 1st",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{false, false, true, true},
},
{
name: "partial overlap pick older",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 1,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 2,
},
},
},
exp: []bool{true, false, false},
},
{
name: "no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
},
},
exp: []bool{false, false},
},
} {
t.Run(tc.name, func(t *testing.T) {
needsDelete := make([]bool, len(tc.exp))
filterOverlapByLibrary(tc.inp, needsDelete)
for i, exp := range tc.exp {
if needsDelete[i] != exp {
t.Fatalf("expected: %v\ngot: %v", tc.exp, needsDelete)
}
}
})
}
}
package discover package discover
import ( import (
"fmt" "context"
"log/slog" "log/slog"
"path/filepath"
"runtime"
"strings"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/ml"
) )
type memInfo struct { type memInfo struct {
...@@ -15,8 +19,8 @@ type memInfo struct { ...@@ -15,8 +19,8 @@ type memInfo struct {
// Beginning of an `ollama info` command // Beginning of an `ollama info` command
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
ml.DeviceID
memInfo memInfo
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags) // Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant"` Variant string `json:"variant"`
...@@ -27,17 +31,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? ...@@ -27,17 +31,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath []string `json:"lib_path,omitempty"` DependencyPath []string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key=value]
EnvWorkarounds []string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates // Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage // the FreeMemory is best effort, and may over or under report actual memory usage
// False indicates FreeMemory can generally be trusted on this GPU // False indicates FreeMemory can generally be trusted on this GPU
UnreliableFreeMemory bool UnreliableFreeMemory bool
// GPU information // GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
filterID int //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
Name string `json:"name"` // user friendly name if available Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx Compute string `json:"compute"` // Compute Capability or gfx
...@@ -70,37 +70,8 @@ type CPU struct { ...@@ -70,37 +70,8 @@ type CPU struct {
ThreadCount int ThreadCount int
} }
type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
computeMajor int //nolint:unused,nolintlint
computeMinor int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
type RocmGPUInfo struct {
GpuInfo
usedFilepath string //nolint:unused,nolintlint
index int //nolint:unused,nolintlint
}
type RocmGPUInfoList []RocmGPUInfo
type OneapiGPUInfo struct {
GpuInfo
driverIndex int //nolint:unused,nolintlint
gpuIndex int //nolint:unused,nolintlint
}
type OneapiGPUInfoList []OneapiGPUInfo
type GpuInfoList []GpuInfo type GpuInfoList []GpuInfo
type UnsupportedGPUInfo struct {
GpuInfo
Reason string `json:"reason"`
}
// Split up the set of gpu info's by Library and variant
func (l GpuInfoList) ByLibrary() []GpuInfoList { func (l GpuInfoList) ByLibrary() []GpuInfoList {
resp := []GpuInfoList{} resp := []GpuInfoList{}
libs := []string{} libs := []string{}
...@@ -125,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { ...@@ -125,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
return resp return resp
} }
// Report the GPU information into the log an Info level func LogDetails(devices []ml.DeviceInfo) {
func (l GpuInfoList) LogDetails() { for _, dev := range devices {
for _, g := range l { var libs []string
for _, dir := range dev.LibraryPath {
if strings.Contains(dir, filepath.Join("lib", "ollama")) {
libs = append(libs, filepath.Base(dir))
}
}
typeStr := "discrete"
if dev.Integrated {
typeStr = "iGPU"
}
slog.Info("inference compute", slog.Info("inference compute",
"id", g.ID, "id", dev.ID,
"library", g.Library, "library", dev.Library,
"variant", g.Variant, "compute", dev.Compute(),
"compute", g.Compute, "name", dev.Name,
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "description", dev.Description,
"name", g.Name, "libdirs", strings.Join(libs, ","),
"total", format.HumanBytes2(g.TotalMemory), "driver", dev.Driver(),
"available", format.HumanBytes2(g.FreeMemory), "pci_id", dev.PCIID,
"type", typeStr,
"total", format.HumanBytes2(dev.TotalMemory),
"available", format.HumanBytes2(dev.FreeMemory),
)
}
// CPU inference
if len(devices) == 0 {
dev, _ := GetCPUMem()
slog.Info("inference compute",
"id", "cpu",
"library", "cpu",
"compute", "",
"name", "cpu",
"description", "cpu",
"libdirs", "ollama",
"driver", "",
"pci_id", "",
"type", "",
"total", format.HumanBytes2(dev.TotalMemory),
"available", format.HumanBytes2(dev.FreeMemory),
) )
} }
} }
...@@ -149,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] } ...@@ -149,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory } func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type SystemInfo struct { type SystemInfo struct {
System CPUInfo `json:"system"` System CPUInfo `json:"system"`
GPUs []GpuInfo `json:"gpus"` GPUs []GpuInfo `json:"gpus"`
UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
DiscoveryErrors []string `json:"discovery_errors"`
} }
// Return the optimal number of threads to use for inference // Return the optimal number of threads to use for inference
func (si SystemInfo) GetOptimalThreadCount() int { func (si SystemInfo) GetOptimalThreadCount() int {
if len(si.System.CPUs) == 0 { if len(si.System.CPUs) == 0 {
return 0 // Fall back to Go's num CPU
return runtime.NumCPU()
} }
coreCount := 0 coreCount := 0
...@@ -173,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int { ...@@ -173,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int {
func (l GpuInfoList) FlashAttentionSupported() bool { func (l GpuInfoList) FlashAttentionSupported() bool {
for _, gpu := range l { for _, gpu := range l {
supportsFA := gpu.Library == "cpu" || supportsFA := gpu.Library == "cpu" ||
gpu.Library == "metal" || gpu.Name == "Metal" || gpu.Library == "Metal" ||
(gpu.Library == "cuda" && gpu.DriverMajor >= 7) || (gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
gpu.Library == "rocm" gpu.Library == "ROCm"
if !supportsFA { if !supportsFA {
return false return false
...@@ -183,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool { ...@@ -183,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
} }
return true return true
} }
type BaseRunner interface {
// GetPort returns the localhost port number the runner is running on
GetPort() int
// HasExited indicates if the runner is no longer running. This can be used during
// bootstrap to detect if a given filtered device is incompatible and triggered an assert
HasExited() bool
}
type RunnerDiscovery interface {
BaseRunner
// GetDeviceInfos will perform a query of the underlying device libraries
// for device identification and free VRAM information
// During bootstrap scenarios, this routine may take seconds to complete
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}
type FilteredRunnerDiscovery interface {
RunnerDiscovery
// GetActiveDeviceIDs returns the filtered set of devices actively in
// use by this runner for running models. If the runner is a bootstrap runner, no devices
// will be active yet so no device IDs are returned.
// This routine will not query the underlying device and will return immediately
GetActiveDeviceIDs() []ml.DeviceID
}
...@@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows. ...@@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows.
| AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` | | AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
| AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` | | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
### Known Workarounds
- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA
### Overrides on Linux ### Overrides on Linux
Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
......
...@@ -264,14 +264,13 @@ var ( ...@@ -264,14 +264,13 @@ var (
rainbowFollowups = []string{ rainbowFollowups = []string{
"Explain the physics involved in them. Be breif in your reply", "Explain the physics involved in them. Be breif in your reply",
"Explain the chemistry involved in them. Be breif in your reply", "Explain the chemistry involved in them. Be breif in your reply",
"Explain the quantum mechanics involved in them. Be breif in your reply",
"What are common myths related to them? Be brief in your reply", "What are common myths related to them? Be brief in your reply",
"What are common fairytales related to them? Be brief in your reply", "What are common fairytales related to them? Be brief in your reply",
"Can they form if there is no rain? Be breif in your reply", "Can they form if there is no rain? Be breif in your reply",
"Can they form if there are no clouds? Be breif in your reply", "Can they form if there are no clouds? Be breif in your reply",
"Do they happen on other planets? Be brief in your reply", "Do they happen on other planets? Be brief in your reply",
} }
rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"} rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
) )
func init() { func init() {
...@@ -456,6 +455,24 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin ...@@ -456,6 +455,24 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
t.Fatal(err) t.Fatal(err)
} }
} }
// Make sure server is online and healthy before returning
listCtx, cancel := context.WithDeadlineCause(
ctx,
time.Now().Add(120*time.Second),
fmt.Errorf("list models took too long"),
)
defer cancel()
models, err := client.ListRunning(listCtx)
if err != nil {
t.Fatal(err)
}
if len(models.Models) > 0 {
names := make([]string, len(models.Models))
for i, m := range models.Models {
names[i] = m.Name
}
slog.Info("currently loaded", "models", names)
}
return client, testEndpoint, func() { return client, testEndpoint, func() {
if os.Getenv("OLLAMA_TEST_EXISTING") == "" { if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
...@@ -577,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { ...@@ -577,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
KeepAlive: &api.Duration{Duration: 10 * time.Second}, KeepAlive: &api.Duration{Duration: 10 * time.Second},
}, { }, {
Model: smol, Model: smol,
Prompt: "how do rainbows form? Be brief but factual in your reply", Prompt: rainbowPrompt,
Stream: &stream, Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second}, KeepAlive: &api.Duration{Duration: 10 * time.Second},
}, { }, {
...@@ -595,7 +612,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { ...@@ -595,7 +612,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
[][]string{ [][]string{
{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"}, {"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"}, {"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"}, rainbowExpected,
{"fourth", "july", "declaration", "independence"}, {"fourth", "july", "declaration", "independence"},
{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"}, {"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
} }
......
...@@ -42,6 +42,7 @@ import ( ...@@ -42,6 +42,7 @@ import (
_ "github.com/ollama/ollama/llama/llama.cpp/common" _ "github.com/ollama/ollama/llama/llama.cpp/common"
_ "github.com/ollama/ollama/llama/llama.cpp/src" _ "github.com/ollama/ollama/llama/llama.cpp/src"
_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd" _ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
"github.com/ollama/ollama/ml"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
) )
...@@ -62,8 +63,8 @@ func BackendInit() { ...@@ -62,8 +63,8 @@ func BackendInit() {
C.llama_backend_init() C.llama_backend_init()
} }
func EnumerateGPUs() []string { func EnumerateGPUs() []ml.DeviceID {
var ids []string var ids []ml.DeviceID
for i := range C.ggml_backend_dev_count() { for i := range C.ggml_backend_dev_count() {
device := C.ggml_backend_dev_get(i) device := C.ggml_backend_dev_get(i)
...@@ -71,7 +72,10 @@ func EnumerateGPUs() []string { ...@@ -71,7 +72,10 @@ func EnumerateGPUs() []string {
if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU { if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
var props C.struct_ggml_backend_dev_props var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(device, &props) C.ggml_backend_dev_get_props(device, &props)
ids = append(ids, C.GoString(props.id)) ids = append(ids, ml.DeviceID{
ID: C.GoString(props.id),
Library: C.GoString(props.library),
})
} }
} }
......
This diff is collapsed.
...@@ -196,7 +196,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -196,7 +196,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) && useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
discover.GetGPUInfo().FlashAttentionSupported() && (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
f.SupportsFlashAttention() f.SupportsFlashAttention()
var kvct string var kvct string
...@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
// on metal there's no partial offload overhead // on metal there's no partial offload overhead
if gpus[0].Library == "metal" { if gpus[0].Library == "Metal" {
graphPartialOffload = graphFullOffload graphPartialOffload = graphFullOffload
} else if len(gpus) > 1 { } else if len(gpus) > 1 {
// multigpu should always use the partial graph size // multigpu should always use the partial graph size
......
...@@ -12,6 +12,7 @@ import ( ...@@ -12,6 +12,7 @@ import (
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover" "github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
) )
func TestEstimateGPULayers(t *testing.T) { func TestEstimateGPULayers(t *testing.T) {
...@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) { ...@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
// Simple CPU scenario // Simple CPU scenario
gpus := []discover.GpuInfo{ gpus := []discover.GpuInfo{
{ {
Library: "cpu", DeviceID: ml.DeviceID{
Library: "cpu",
},
}, },
} }
projectors := []string{} projectors := []string{}
...@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) { ...@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
gpuMinimumMemory := uint64(2048) gpuMinimumMemory := uint64(2048)
gpus = []discover.GpuInfo{ gpus = []discover.GpuInfo{
{ {
Library: "cuda", DeviceID: ml.DeviceID{
Library: "cuda",
},
MinimumMemory: gpuMinimumMemory, MinimumMemory: gpuMinimumMemory,
}, },
{ {
Library: "cuda", DeviceID: ml.DeviceID{
Library: "cuda",
},
MinimumMemory: gpuMinimumMemory, MinimumMemory: gpuMinimumMemory,
}, },
} }
......
This diff is collapsed.
...@@ -16,8 +16,8 @@ import ( ...@@ -16,8 +16,8 @@ import (
func TestLLMServerFitGPU(t *testing.T) { func TestLLMServerFitGPU(t *testing.T) {
type gpu struct { type gpu struct {
library string id ml.DeviceID
free int free int
} }
tests := []struct { tests := []struct {
...@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) { ...@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
}, },
{ {
name: "Full single GPU", name: "Full single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
}, },
{ {
name: "Partial single GPU", name: "Partial single GPU",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
}, },
{ {
name: "Single GPU with numGPU 1", name: "Single GPU with numGPU 1",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1, numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
}, },
{ {
name: "Single GPU with numGPU 0", name: "Single GPU with numGPU 0",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0, numGPU: 0,
expected: ml.GPULayersList{}, expected: ml.GPULayersList{},
}, },
{ {
name: "Single GPU with numGPU 999", name: "Single GPU with numGPU 999",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999, numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
}, },
{ {
name: "Multi GPU fits on one", name: "Multi GPU fits on one",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
}, },
{ {
name: "Multi GPU split", name: "Multi GPU split",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
}, },
{ {
name: "Multi GPU partial", name: "Multi GPU partial",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 1", name: "Multi GPU numGPU 1",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1, numGPU: 1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 2", name: "Multi GPU numGPU 2",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2, numGPU: 2,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
}, },
{ {
name: "Multi GPU numGPU 999", name: "Multi GPU numGPU 999",
gpus: []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999, numGPU: 999,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
}, },
{ {
name: "Multi GPU different libraries", name: "Multi GPU different libraries",
gpus: []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte}, layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1, numGPU: -1,
expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}}, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
}, },
{ {
name: "requireFull", name: "requireFull",
gpus: []gpu{{free: 256 * format.MebiByte}}, gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1, numGPU: -1,
requireFull: true, requireFull: true,
...@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) { ...@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {
gpus := make(discover.GpuInfoList, len(tt.gpus)) gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus { for i := range tt.gpus {
gpus[i].ID = fmt.Sprintf("gpu%d", i) gpus[i].DeviceID = tt.gpus[i].id
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free) gpus[i].FreeMemory = uint64(tt.gpus[i].free)
} }
...@@ -164,7 +163,7 @@ func TestLLMServerFitGPU(t *testing.T) { ...@@ -164,7 +163,7 @@ func TestLLMServerFitGPU(t *testing.T) {
} }
for i := range s.mem.GPUs { for i := range s.mem.GPUs {
s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i) s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers) s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers) s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
} }
......
This diff is collapsed.
package ggml package ggml
// #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
// #cgo windows LDFLAGS: -lpthread
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include // #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h> // #include <stdlib.h>
// #include <stdint.h> // #include <stdint.h>
...@@ -168,6 +170,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -168,6 +170,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props) C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.ID = C.GoString(props.id) requiredMemory.CPU.ID = C.GoString(props.id)
requiredMemory.CPU.Library = C.GoString(props.library)
requiredMemory.CPU.Weights = make([]uint64, blocks+1) requiredMemory.CPU.Weights = make([]uint64, blocks+1)
requiredMemory.CPU.Cache = make([]uint64, blocks+1) requiredMemory.CPU.Cache = make([]uint64, blocks+1)
...@@ -186,6 +189,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -186,6 +189,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props) C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].ID = C.GoString(props.id) requiredMemory.GPUs[i].ID = C.GoString(props.id)
requiredMemory.GPUs[i].Library = C.GoString(props.library)
requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1) requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1) requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
} }
...@@ -198,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { ...@@ -198,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
for _, l := range p.Layers { for _, l := range p.Layers {
if l == layer { if l == layer {
for i := range requiredMemory.GPUs { for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].ID == p.ID { if requiredMemory.GPUs[i].DeviceID == p.DeviceID {
return gpuDeviceBufferTypes[i] return gpuDeviceBufferTypes[i]
} }
} }
...@@ -682,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig { ...@@ -682,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
} }
} }
func (b *Backend) BackendDevices() []ml.DeviceInfo {
deviceInfos := []ml.DeviceInfo{}
for _, dev := range gpus {
// If we have a model loaded, and it's only loaded on a subset of the devices
// skip idle/unused devices to avoid initializing them and causing VRAM allocations
if b.allocMemory {
idleDev := true
for _, backend := range b.schedBackends {
if dev == C.ggml_backend_get_device(backend) {
idleDev = false
break
}
}
if idleDev {
slog.Debug("skipping unused backend device", "description", C.GoString(C.ggml_backend_dev_description(dev)))
continue
}
}
info := ml.DeviceInfo{}
props := C.struct_ggml_backend_dev_props{}
C.ggml_backend_dev_get_props(dev, &props)
info.Name = C.GoString(props.name)
info.Description = C.GoString(props.description)
info.ID = C.GoString(props.id)
info.Library = C.GoString(props.library)
info.ComputeMajor = (int)(props.compute_major)
info.ComputeMinor = (int)(props.compute_minor)
info.DriverMajor = (int)(props.driver_major)
info.DriverMinor = (int)(props.driver_minor)
info.Integrated = props.integrated != 0
if props.library != nil {
info.Library = C.GoString(props.library)
}
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
info.LibraryPath = ggml.LibPaths()
C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
info.TotalMemory = (uint64)(props.memory_total)
info.FreeMemory = (uint64)(props.memory_free)
deviceInfos = append(deviceInfos, info)
}
return deviceInfos
}
type Context struct { type Context struct {
b *Backend b *Backend
......
...@@ -157,6 +157,15 @@ extern "C" { ...@@ -157,6 +157,15 @@ extern "C" {
size_t memory_total; size_t memory_total;
enum ggml_backend_dev_type type; enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps; struct ggml_backend_dev_caps caps;
int driver_major;
int driver_minor;
int compute_major;
int compute_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
const char *library;
}; };
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
......
...@@ -203,6 +203,8 @@ add_library(ggml-base ...@@ -203,6 +203,8 @@ add_library(ggml-base
ggml-threading.h ggml-threading.h
ggml-quants.c ggml-quants.c
ggml-quants.h ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
gguf.cpp) gguf.cpp)
target_include_directories(ggml-base PRIVATE .) target_include_directories(ggml-base PRIVATE .)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment