"git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "d8d70a2856ef760f22d218003b84b411de8b4c68"
Unverified Commit bc8909fb authored by Daniel Hiltgen, committed by GitHub

Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama
runner.  This should eliminate inconsistency between our GPU discovery and the
runner's capabilities at runtime, particularly for cases where we try to filter
out unsupported GPUs.  Now the runner does that implicitly based on the actual
device list.  In some cases free VRAM reporting can be unreliable, which can
lead to scheduling mistakes, so this also includes a patch to leverage more
reliable VRAM reporting libraries if available.

Automatic workarounds have been removed, since only one GPU relied on them; that
workaround is now documented. This GPU will soon fall off the support matrix with
the next ROCm bump.

Additional cleanup of the scheduler and discovery packages can be done in the
future once we have switched on the new memory management code and removed
support for the llama runner.
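
As a rough sketch of how the new flow is consumed (illustrative only, based on the
GPUDevices and LogDetails signatures introduced in this change; the main function
and call site here are not taken from the diff):

// Sketch: bootstrapping discovery with no active runners, then logging what was found.
package main

import (
	"context"

	"github.com/ollama/ollama/discover"
)

func main() {
	ctx := context.Background()
	// With no runners passed in, GPUDevices performs the bootstrap pass: it spawns
	// the Ollama runner against each detected library directory, filters out
	// unsupported devices, and caches the result for later free-VRAM refreshes.
	devices := discover.GPUDevices(ctx, nil)
	discover.LogDetails(devices)
}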
parent 6b50f2b9
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#include "gpu_info_oneapi.h"
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
{"zesInit", (void *)&resp->oh.zesInit},
{"zesDriverGet", (void *)&resp->oh.zesDriverGet},
{"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
{"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
{"zesDeviceEnumMemoryModules",
(void *)&resp->oh.zesDeviceEnumMemoryModules},
{"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
{"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
{NULL, NULL},
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
oneapi_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose,
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
// Unload the library before clearing the handle so it is actually released
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
if (driver >= h.num_drivers || device >= h.num_devices[driver]) {
resp->err = strdup("driver or device index out of bounds");
return;
}
resp->total = 0;
resp->free = 0;
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver >= h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_ONEAPI_H__
#define __GPU_INFO_ONEAPI_H__
#include "gpu_info.h"
#define ZE_MAX_DEVICE_NAME 256
#define ZE_MAX_DEVICE_UUID_SIZE 16
#define ZES_STRING_PROPERTY_SIZE 64
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
typedef uint8_t ze_bool_t;
typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
ZE_DEVICE_TYPE_MCA = 4,
ZE_DEVICE_TYPE_VPU = 5,
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
ZES_DEVICE_TYPE_MCA = 4,
ZES_DEVICE_TYPE_VPU = 5,
ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
uint32_t vendorId;
uint32_t deviceId;
ze_device_property_flags_t flags;
uint32_t subdeviceId;
uint32_t coreClockRate;
uint64_t maxMemAllocSize;
uint32_t maxHardwareContexts;
uint32_t maxCommandQueuePriority;
uint32_t numThreadsPerEU;
uint32_t physicalEUSimdWidth;
uint32_t numEUsPerSubslice;
uint32_t numSubslicesPerSlice;
uint32_t numSlices;
uint64_t timerResolution;
uint32_t timestampValidBits;
uint32_t kernelTimestampValidBits;
ze_device_uuid_t uuid;
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
uint32_t numSubdevices;
char serialNumber[ZES_STRING_PROPERTY_SIZE];
char boardNumber[ZES_STRING_PROPERTY_SIZE];
char brandName[ZES_STRING_PROPERTY_SIZE];
char modelName[ZES_STRING_PROPERTY_SIZE];
char vendorName[ZES_STRING_PROPERTY_SIZE];
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
zes_device_type_t type;
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
ze_bool_t onSubdevice;
uint32_t subdeviceId;
zes_mem_loc_t location;
uint64_t physicalSize;
int32_t busWidth;
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
uint64_t free;
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
zes_device_handle_t *phDevices);
ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
zes_device_properties_t *pProperties);
ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
uint32_t *pCount,
zes_mem_handle_t *phMemory);
ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
zes_mem_properties_t *pProperties);
ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
zes_mem_state_t *pState);
} oneapi_handle_t;
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_ONEAPI_H__
#endif // __APPLE__
package discover
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
assert.Greater(t, info[0].FreeMemory, uint64(0))
}
}
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
func TestByLibrary(t *testing.T) {
type testCase struct {
input []GpuInfo
expect int
}
testCases := map[string]*testCase{
"empty": {input: []GpuInfo{}, expect: 0},
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
}
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
resp := (GpuInfoList)(v.input).ByLibrary()
if len(resp) != v.expect {
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
}
})
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package discover
// Runner based GPU discovery
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
)
var (
deviceMu sync.Mutex
devices []ml.DeviceInfo
libDirs map[string]struct{}
rocmDir string
exe string
bootstrapped bool
)
func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
startDiscovery := time.Now()
msg := "overall device VRAM discovery took"
defer func() {
slog.Debug(msg, "duration", time.Since(startDiscovery))
}()
if !bootstrapped {
msg = "GPU bootstrap discovery took"
libDirs = make(map[string]struct{})
var err error
exe, err = os.Executable()
if err != nil {
slog.Error("unable to lookup executable path", "error", err)
return nil
}
if eval, err := filepath.EvalSymlinks(exe); err == nil {
exe = eval
}
files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
if err != nil {
slog.Debug("unable to lookup runner library directories", "error", err)
}
for _, file := range files {
libDirs[filepath.Dir(file)] = struct{}{}
}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir = filepath.Join(LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
slog.Info("discovering available GPUs...")
// For our initial discovery pass, we gather all the known GPUs through
// all the libraries that were detected. This pass may include GPUs that
// are enumerated, but not actually supported.
// We run this in serial to avoid potentially initializing a GPU multiple
// times concurrently leading to memory contention
for dir := range libDirs {
var dirs []string
if dir == "" {
dirs = []string{LibOllamaPath}
} else {
dirs = []string{LibOllamaPath, dir}
}
// Typically bootstrapping takes < 1s, but on some systems, with devices
// in low power/idle mode, initialization can take multiple seconds. We
// set a long timeout just for bootstrap discovery to reduce the chance
// of giving up too quickly
ctx1stPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
// For this pass, we retain duplicates in case any are incompatible with some libraries
devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
}
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
var wg sync.WaitGroup
needsDelete := make([]bool, len(devices))
supportedMu := sync.Mutex{}
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
for i := range devices {
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
if devices[i].Library == "Metal" {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
var envVar string
if devices[i].Library == "ROCm" {
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
} else {
envVar = "ROCR_VISIBLE_DEVICES"
}
} else {
envVar = "CUDA_VISIBLE_DEVICES"
}
extraEnvs := []string{
"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
envVar + "=" + devices[i].ID, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
needsDelete[i] = true
} else {
supportedMu.Lock()
if _, ok := supported[devices[i].Library]; !ok {
supported[devices[i].Library] = make(map[string]map[string]int)
}
if _, ok := supported[devices[i].Library][libDir]; !ok {
supported[devices[i].Library][libDir] = make(map[string]int)
}
supported[devices[i].Library][libDir][devices[i].ID] = i
supportedMu.Unlock()
}
}(i)
}
wg.Wait()
logutil.Trace("supported GPU library combinations", "supported", supported)
// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
filterOverlapByLibrary(supported, needsDelete)
// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
rocmID := 0
for i := 0; i < len(needsDelete); i++ {
if needsDelete[i] {
logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
devices = append(devices[:i], devices[i+1:]...)
needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
i--
} else if devices[i].Library == "ROCm" {
if _, err := strconv.Atoi(devices[i].ID); err == nil {
// Replace the numeric ID with the post-filtered IDs
devices[i].FilteredID = devices[i].ID
devices[i].ID = strconv.Itoa(rocmID)
}
rocmID++
}
}
// Now filter out any overlap with different libraries (favor CUDA/ROCm over others)
for i := 0; i < len(devices); i++ {
for j := i + 1; j < len(devices); j++ {
// For this pass, we only drop exact duplicates
switch devices[i].Compare(devices[j]) {
case ml.SameBackendDevice:
// Same library and device, skip it
devices = append(devices[:j], devices[j+1:]...)
j--
continue
case ml.DuplicateDevice:
// Different library, choose based on priority
var droppedDevice ml.DeviceInfo
if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
droppedDevice = devices[j]
} else {
droppedDevice = devices[i]
devices[i] = devices[j]
}
devices = append(devices[:j], devices[j+1:]...)
j--
typeStr := "discrete"
if droppedDevice.Integrated {
typeStr = "iGPU"
}
slog.Debug("dropping duplicate device",
"id", droppedDevice.ID,
"library", droppedDevice.Library,
"compute", droppedDevice.Compute(),
"name", droppedDevice.Name,
"description", droppedDevice.Description,
"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
"driver", droppedDevice.Driver(),
"pci_id", droppedDevice.PCIID,
"type", typeStr,
"total", format.HumanBytes2(droppedDevice.TotalMemory),
"available", format.HumanBytes2(droppedDevice.FreeMemory),
)
continue
}
}
}
// Reset the libDirs to what we actually wind up using for future refreshes
libDirs = make(map[string]struct{})
for _, dev := range devices {
dir := dev.LibraryPath[len(dev.LibraryPath)-1]
if dir != LibOllamaPath {
libDirs[dir] = struct{}{}
}
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
bootstrapped = true
} else {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
// metal never updates free VRAM
return devices
}
slog.Debug("refreshing free memory")
updated := make([]bool, len(devices))
allDone := func() bool {
allDone := true
for _, done := range updated {
if !done {
allDone = false
break
}
}
return allDone
}
// First try to use existing runners to refresh VRAM since they're already
// active on GPU(s)
for _, runner := range runners {
if runner == nil {
continue
}
deviceIDs := runner.GetActiveDeviceIDs()
if len(deviceIDs) == 0 {
// Skip this runner since it doesn't have active GPU devices
continue
}
// Check to see if this runner is active on any devices that need a refresh
skip := true
devCheck:
for _, dev := range deviceIDs {
for i := range devices {
if dev == devices[i].DeviceID {
if !updated[i] {
skip = false
break devCheck
}
}
}
}
if skip {
continue
}
// Typical refresh on existing runner is ~500ms but allow longer if the system
// is under stress before giving up and using stale data.
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
start := time.Now()
updatedDevices := runner.GetDeviceInfos(ctx)
slog.Debug("existing runner discovery took", "duration", time.Since(start))
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
}
// Short circuit if we've updated all the devices
if allDone() {
break
}
}
if !allDone() {
slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
// Bootstrapping may take longer in some cases (AMD windows), but we
// would rather use stale free data to get the model running sooner
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
for dir := range libDirs {
updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil)
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
}
if allDone() {
break
}
}
if !allDone() {
slog.Warn("unable to refresh free memory, using old values")
}
}
}
return devices
}
func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
// For multi-GPU systems, use the newest version that supports all the GPUs
for _, byLibDirs := range supported {
libDirs := make([]string, 0, len(byLibDirs))
for libDir := range byLibDirs {
libDirs = append(libDirs, libDir)
}
sort.Sort(sort.Reverse(sort.StringSlice(libDirs)))
anyMissing := false
var newest string
for _, newest = range libDirs {
for _, libDir := range libDirs {
if libDir == newest {
continue
}
if len(byLibDirs[newest]) != len(byLibDirs[libDir]) {
anyMissing = true
break
}
for dev := range byLibDirs[newest] {
if _, found := byLibDirs[libDir][dev]; !found {
anyMissing = true
break
}
}
}
if !anyMissing {
break
}
}
// Now we can mark overlaps for deletion
for _, libDir := range libDirs {
if libDir == newest {
continue
}
for dev, i := range byLibDirs[libDir] {
if _, found := byLibDirs[newest][dev]; found {
needsDelete[i] = true
}
}
}
}
}
type bootstrapRunner struct {
port int
cmd *exec.Cmd
}
func (r *bootstrapRunner) GetPort() int {
return r.port
}
func (r *bootstrapRunner) HasExited() bool {
if r.cmd != nil && r.cmd.ProcessState != nil {
return true
}
return false
}
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
// TODO DRY out with llm/server.go
slog.Debug("spawing runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
start := time.Now()
defer func() {
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
}()
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
if rocmDir != "" {
libraryPaths = append(libraryPaths, rocmDir)
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd := exec.Command(exe, params...)
cmd.Env = os.Environ()
if envconfig.LogLevel() == logutil.LevelTrace {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
pathNeeded := true
extraDone := make([]bool, len(extraEnvs))
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else {
for j := range extraEnvs {
if extraDone[j] {
continue
}
extra := strings.SplitN(extraEnvs[j], "=", 2)
if cmp[0] == extra[0] {
cmd.Env[i] = extraEnvs[j]
extraDone[j] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
for i := range extraDone {
if !extraDone[i] {
cmd.Env = append(cmd.Env, extraEnvs[i])
}
}
logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
if err := cmd.Start(); err != nil {
slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
return nil
}
go func() {
cmd.Wait() // exit status ignored
}()
defer cmd.Process.Kill()
devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
if err != nil {
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
// Expected during bootstrapping while we filter out unsupported AMD GPUs
logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
} else {
slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
}
}
logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
return devices
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
var moreDevices []ml.DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
package discover
import (
"testing"
"github.com/ollama/ollama/app/lifecycle"
)
func init() {
lifecycle.InitLogging()
}
func TestFilterOverlapByLibrary(t *testing.T) {
type testcase struct {
name string
inp map[string]map[string]map[string]int
exp []bool
}
for _, tc := range []testcase{
{
name: "empty",
inp: map[string]map[string]map[string]int{},
exp: []bool{}, // needs deletion
},
{
name: "single no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
},
},
exp: []bool{false},
},
{
name: "100% overlap pick 2nd",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{true, true, false, false},
},
{
name: "100% overlap pick 1st",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{false, false, true, true},
},
{
name: "partial overlap pick older",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 1,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 2,
},
},
},
exp: []bool{true, false, false},
},
{
name: "no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
},
},
exp: []bool{false, false},
},
} {
t.Run(tc.name, func(t *testing.T) {
needsDelete := make([]bool, len(tc.exp))
filterOverlapByLibrary(tc.inp, needsDelete)
for i, exp := range tc.exp {
if needsDelete[i] != exp {
t.Fatalf("expected: %v\ngot: %v", tc.exp, needsDelete)
}
}
})
}
}
 package discover
 
 import (
-	"fmt"
+	"context"
 	"log/slog"
+	"path/filepath"
+	"runtime"
+	"strings"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 )
 
 type memInfo struct {
@@ -15,8 +19,8 @@ type memInfo struct {
 // Beginning of an `ollama info` command
 type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
+	ml.DeviceID
 	memInfo
-	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant"`
@@ -27,17 +31,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath []string `json:"lib_path,omitempty"`
 
-	// Extra environment variables specific to the GPU as list of [key=value]
-	EnvWorkarounds []string `json:"envs,omitempty"`
-
 	// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
 	// the FreeMemory is best effort, and may over or under report actual memory usage
 	// False indicates FreeMemory can generally be trusted on this GPU
 	UnreliableFreeMemory bool
 
 	// GPU information
-	ID       string `json:"gpu_id"` // string to use for selection of this specific GPU
-	filterID int    //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
+	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
 	Name    string `json:"name"`    // user friendly name if available
 	Compute string `json:"compute"` // Compute Capability or gfx
@@ -70,37 +70,8 @@ type CPU struct {
 	ThreadCount int
 }
 
-type CudaGPUInfo struct {
-	GpuInfo
-	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index        int    //nolint:unused,nolintlint
-	computeMajor int    //nolint:unused,nolintlint
-	computeMinor int    //nolint:unused,nolintlint
-}
-type CudaGPUInfoList []CudaGPUInfo
-
-type RocmGPUInfo struct {
-	GpuInfo
-	usedFilepath string //nolint:unused,nolintlint
-	index        int    //nolint:unused,nolintlint
-}
-type RocmGPUInfoList []RocmGPUInfo
-
-type OneapiGPUInfo struct {
-	GpuInfo
-	driverIndex int //nolint:unused,nolintlint
-	gpuIndex    int //nolint:unused,nolintlint
-}
-type OneapiGPUInfoList []OneapiGPUInfo
-
 type GpuInfoList []GpuInfo
 
-type UnsupportedGPUInfo struct {
-	GpuInfo
-	Reason string `json:"reason"`
-}
-
-// Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
 	libs := []string{}
@@ -125,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }
 
-// Report the GPU information into the log an Info level
-func (l GpuInfoList) LogDetails() {
-	for _, g := range l {
+func LogDetails(devices []ml.DeviceInfo) {
+	for _, dev := range devices {
+		var libs []string
+		for _, dir := range dev.LibraryPath {
+			if strings.Contains(dir, filepath.Join("lib", "ollama")) {
+				libs = append(libs, filepath.Base(dir))
+			}
+		}
+		typeStr := "discrete"
+		if dev.Integrated {
+			typeStr = "iGPU"
+		}
 		slog.Info("inference compute",
-			"id", g.ID,
-			"library", g.Library,
-			"variant", g.Variant,
-			"compute", g.Compute,
-			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
-			"name", g.Name,
-			"total", format.HumanBytes2(g.TotalMemory),
-			"available", format.HumanBytes2(g.FreeMemory),
+			"id", dev.ID,
+			"library", dev.Library,
+			"compute", dev.Compute(),
+			"name", dev.Name,
+			"description", dev.Description,
+			"libdirs", strings.Join(libs, ","),
+			"driver", dev.Driver(),
+			"pci_id", dev.PCIID,
+			"type", typeStr,
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
+		)
+	}
+	// CPU inference
+	if len(devices) == 0 {
+		dev, _ := GetCPUMem()
+		slog.Info("inference compute",
+			"id", "cpu",
+			"library", "cpu",
+			"compute", "",
+			"name", "cpu",
+			"description", "cpu",
+			"libdirs", "ollama",
+			"driver", "",
+			"pci_id", "",
+			"type", "",
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
 }
@@ -149,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
 
 type SystemInfo struct {
-	System          CPUInfo              `json:"system"`
-	GPUs            []GpuInfo            `json:"gpus"`
-	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
-	DiscoveryErrors []string             `json:"discovery_errors"`
+	System CPUInfo   `json:"system"`
+	GPUs   []GpuInfo `json:"gpus"`
 }
 
 // Return the optimal number of threads to use for inference
 func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
-		return 0
+		// Fall back to Go's num CPU
+		return runtime.NumCPU()
 	}
 	coreCount := 0
@@ -173,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
-			gpu.Library == "metal" ||
-			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "rocm"
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
+			gpu.Library == "ROCm"
 
 		if !supportsFA {
 			return false
@@ -183,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	}
 	return true
 }
+
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+
+	// HasExited indicates if the runner is no longer running. This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+
+type RunnerDiscovery interface {
+	BaseRunner
+
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+}
+
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models. If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []ml.DeviceID
+}
@@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows.
 | AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
 
+### Known Workarounds
+
+- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA
 
 ### Overrides on Linux
 
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
......
@@ -264,14 +264,13 @@ var (
 	rainbowFollowups = []string{
 		"Explain the physics involved in them. Be breif in your reply",
 		"Explain the chemistry involved in them. Be breif in your reply",
-		"Explain the quantum mechanics involved in them. Be breif in your reply",
 		"What are common myths related to them? Be brief in your reply",
 		"What are common fairytales related to them? Be brief in your reply",
 		"Can they form if there is no rain? Be breif in your reply",
 		"Can they form if there are no clouds? Be breif in your reply",
 		"Do they happen on other planets? Be brief in your reply",
 	}
-	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
 )
 
 func init() {
@@ -456,6 +455,24 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 			t.Fatal(err)
 		}
 	}
+
+	// Make sure server is online and healthy before returning
+	listCtx, cancel := context.WithDeadlineCause(
+		ctx,
+		time.Now().Add(120*time.Second),
+		fmt.Errorf("list models took too long"),
+	)
+	defer cancel()
+	models, err := client.ListRunning(listCtx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(models.Models) > 0 {
+		names := make([]string, len(models.Models))
+		for i, m := range models.Models {
+			names[i] = m.Name
+		}
+		slog.Info("currently loaded", "models", names)
+	}
 
 	return client, testEndpoint, func() {
 		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
@@ -577,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 		}, {
 			Model:     smol,
-			Prompt:    "how do rainbows form? Be brief but factual in your reply",
+			Prompt:    rainbowPrompt,
 			Stream:    &stream,
 			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 		}, {
@@ -595,7 +612,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
 			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
-			{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"},
+			rainbowExpected,
 			{"fourth", "july", "declaration", "independence"},
 			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}
......
@@ -42,6 +42,7 @@ import (
 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
 	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
+	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )
 
@@ -62,8 +63,8 @@ func BackendInit() {
 	C.llama_backend_init()
 }
 
-func EnumerateGPUs() []string {
-	var ids []string
+func EnumerateGPUs() []ml.DeviceID {
+	var ids []ml.DeviceID
 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
 
@@ -71,7 +72,10 @@ func EnumerateGPUs() []string {
 		if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, C.GoString(props.id))
+			ids = append(ids, ml.DeviceID{
+				ID:      C.GoString(props.id),
+				Library: C.GoString(props.library),
+			})
 		}
 	}
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 26 Aug 2025 12:48:29 -0700
Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available.
---
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++
8 files changed, 717 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;
+ int compute_major;
+ int compute_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ const char *library;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
+ mem_hip.cpp
+ mem_nvml.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
+#if defined(GGML_USE_HIP)
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
+ CUDA_CHECK(cudaSetDevice(id));
+ // rocblas_initialize will SIGABRT if the GPU isn't supported
+ rocblas_initialize();
+ GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
+ }
+#endif
+
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
+#ifdef __CUDA_ARCH_LIST__
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+ }
+#endif // defined(__CUDA_ARCH_LIST__)
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
+
#endif // defined(GGML_USE_HIP)
}
@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
+ return;
+ }
+ ggml_hip_mgmt_release();
+ }
+#else
+ if (ggml_nvml_init() == 0) {
+ int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_nvml_release();
+ return;
+ }
+ ggml_nvml_release();
+ }
+#endif
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
+#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;
+ props->compute_minor = cc - (props->compute_major * 0x100);
+#else
+ props->compute_major = ctx->major;
+ props->compute_minor = ctx->minor;
+#endif
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+ int driverVersion = 0;
+ CUDA_CHECK(cudaDriverGetVersion(&driverVersion));
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDriverGetVersion hipDriverGetVersion
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true;
}
+// Management libraries for fetching more accurate free VRAM data
+GGML_API int ggml_nvml_init();
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
+GGML_API void ggml_nvml_release();
+GGML_API int ggml_hip_mgmt_init();
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API void ggml_hip_mgmt_release();
+
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c31268f..ec6b385ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}
+#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) {
/* .async = */ false,
/* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
+#include "ggml.h"
+
+#ifdef _WIN32
+// AMD Device Library eXtra (ADLX)
+//
+// https://github.com/GPUOpen-LibrariesAndSDKs/ADLX
+//
+// This Windows-only library provides accurate VRAM reporting for AMD GPUs.
+// The runtime DLL is installed with every AMD Driver on Windows, however
+// the SDK isn't a part of the HIP SDK packaging. As such, we avoid including
+// the headers from the SDK to simplify building from source.
+//
+// ADLX relies heavily on function pointer tables.
+// Only the minimal set of types are defined below to facilitate
+// finding the target AMD GPU(s) and querying their current VRAM usage
+// Unused function parameters are commented out to avoid unnecessary type
+// definitions.
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+
+namespace fs = std::filesystem;
+
+#include <stdio.h>
+#include <stdint.h>
+
+// Begin minimal ADLX definitions - derived from tag v1.0 (Dec 2022)
+typedef uint64_t adlx_uint64;
+typedef uint32_t adlx_uint32;
+typedef int32_t adlx_int32;
+typedef adlx_int32 adlx_int;
+typedef adlx_uint32 adlx_uint;
+typedef long adlx_long;
+typedef uint8_t adlx_uint8;
+typedef enum
+{
+ ADLX_OK = 0, /**< @ENG_START_DOX This result indicates success. @ENG_END_DOX */
+ ADLX_ALREADY_ENABLED, /**< @ENG_START_DOX This result indicates that the asked action is already enabled. @ENG_END_DOX */
+ ADLX_ALREADY_INITIALIZED, /**< @ENG_START_DOX This result indicates that ADLX has a unspecified type of initialization. @ENG_END_DOX */
+ ADLX_FAIL, /**< @ENG_START_DOX This result indicates an unspecified failure. @ENG_END_DOX */
+ ADLX_INVALID_ARGS, /**< @ENG_START_DOX This result indicates that the arguments are invalid. @ENG_END_DOX */
+ ADLX_BAD_VER, /**< @ENG_START_DOX This result indicates that the asked version is incompatible with the current version. @ENG_END_DOX */
+ ADLX_UNKNOWN_INTERFACE, /**< @ENG_START_DOX This result indicates that an unknown interface was asked. @ENG_END_DOX */
+ ADLX_TERMINATED, /**< @ENG_START_DOX This result indicates that the calls were made in an interface after ADLX was terminated. @ENG_END_DOX */
+ ADLX_ADL_INIT_ERROR, /**< @ENG_START_DOX This result indicates that the ADL initialization failed. @ENG_END_DOX */
+ ADLX_NOT_FOUND, /**< @ENG_START_DOX This result indicates that the item is not found. @ENG_END_DOX */
+ ADLX_INVALID_OBJECT, /**< @ENG_START_DOX This result indicates that the method was called into an invalid object. @ENG_END_DOX */
+ ADLX_ORPHAN_OBJECTS, /**< @ENG_START_DOX This result indicates that ADLX was terminated with outstanding ADLX objects. Any interface obtained from ADLX points to invalid memory and calls in their methods will result in unexpected behavior. @ENG_END_DOX */
+ ADLX_NOT_SUPPORTED, /**< @ENG_START_DOX This result indicates that the asked feature is not supported. @ENG_END_DOX */
+ ADLX_PENDING_OPERATION, /**< @ENG_START_DOX This result indicates a failure due to an operation currently in progress. @ENG_END_DOX */
+ ADLX_GPU_INACTIVE /**< @ENG_START_DOX This result indicates that the GPU is inactive. @ENG_END_DOX */
+} ADLX_RESULT;
+#define ADLX_SUCCEEDED(x) (ADLX_OK == (x) || ADLX_ALREADY_ENABLED == (x) || ADLX_ALREADY_INITIALIZED == (x))
+#define ADLX_FAILED(x) (ADLX_OK != (x) && ADLX_ALREADY_ENABLED != (x) && ADLX_ALREADY_INITIALIZED != (x))
+#define ADLX_VER_MAJOR 1
+#define ADLX_VER_MINOR 0
+#define ADLX_VER_RELEASE 5
+#define ADLX_VER_BUILD_NUM 30
+#define ADLX_MAKE_FULL_VER(VERSION_MAJOR, VERSION_MINOR, VERSION_RELEASE, VERSION_BUILD_NUM) ( ((adlx_uint64)(VERSION_MAJOR) << 48ull) | ((adlx_uint64)(VERSION_MINOR) << 32ull) | ((adlx_uint64)(VERSION_RELEASE) << 16ull) | (adlx_uint64)(VERSION_BUILD_NUM))
+#define ADLX_FULL_VERSION ADLX_MAKE_FULL_VER(ADLX_VER_MAJOR, ADLX_VER_MINOR, ADLX_VER_RELEASE, ADLX_VER_BUILD_NUM)
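+// e.g. ADLX_FULL_VERSION for 1.0.5.30 packs as ((adlx_uint64)1 << 48) | (0 << 32) | (5 << 16) | 30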
+#define ADLX_CORE_LINK __declspec(dllexport)
+#define ADLX_STD_CALL __stdcall
+#define ADLX_CDECL_CALL __cdecl
+#define ADLX_FAST_CALL __fastcall
+#define ADLX_INLINE __inline
+#define ADLX_FORCEINLINE __forceinline
+#define ADLX_NO_VTABLE __declspec(novtable)
+
+#if defined(__cplusplus)
+typedef bool adlx_bool;
+#else
+typedef adlx_uint8 adlx_bool;
+#define true 1
+#define false 0
+#endif
+
+typedef struct IADLXSystem IADLXSystem;
+typedef struct IADLXGPUList IADLXGPUList;
+typedef struct IADLXGPU IADLXGPU;
+typedef struct IADLXInterface IADLXInterface;
+typedef struct IADLXPerformanceMonitoringServices IADLXPerformanceMonitoringServices;
+typedef struct IADLXGPUMetrics IADLXGPUMetrics;
+typedef struct IADLXGPUMetricsSupport IADLXGPUMetricsSupport;
+
+typedef struct IADLXSystemVtbl
+{
+ // IADLXSystem interface
+ ADLX_RESULT (ADLX_STD_CALL *GetHybridGraphicsType)(/* IADLXSystem* pThis, ADLX_HG_TYPE* hgType */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUs)(IADLXSystem* pThis, IADLXGPUList** ppGPUs); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXSystem* pThis, const wchar_t* interfaceId, void** ppInterface */);
+ ADLX_RESULT (ADLX_STD_CALL *GetDisplaysServices)(/* IADLXSystem* pThis, IADLXDisplayServices** ppDispServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetDesktopsServices)(/* IADLXSystem* pThis, IADLXDesktopServices** ppDeskServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUsChangedHandling)(/* IADLXSystem* pThis, IADLXGPUsChangedHandling** ppGPUsChangedHandling */);
+ ADLX_RESULT (ADLX_STD_CALL *EnableLog)(/* IADLXSystem* pThis, ADLX_LOG_DESTINATION mode, ADLX_LOG_SEVERITY severity, IADLXLog* pLogger, const wchar_t* fileName */);
+ ADLX_RESULT (ADLX_STD_CALL *Get3DSettingsServices)(/* IADLXSystem* pThis, IADLX3DSettingsServices** pp3DSettingsServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUTuningServices)(/* IADLXSystem* pThis, IADLXGPUTuningServices** ppGPUTuningServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetPerformanceMonitoringServices)(IADLXSystem* pThis, IADLXPerformanceMonitoringServices** ppPerformanceMonitoringServices); // Used
+ ADLX_RESULT (ADLX_STD_CALL *TotalSystemRAM)(/* IADLXSystem* pThis, adlx_uint* ramMB */);
+ ADLX_RESULT (ADLX_STD_CALL *GetI2C)(/* IADLXSystem* pThis, IADLXGPU* pGPU, IADLXI2C** ppI2C */);
+} IADLXSystemVtbl;
+struct IADLXSystem { const IADLXSystemVtbl *pVtbl; };
+
+typedef struct IADLXGPUVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPU* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXGPU* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPU* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPU
+ ADLX_RESULT (ADLX_STD_CALL *VendorId)(/* IADLXGPU* pThis, const char** vendorId */);
+ ADLX_RESULT (ADLX_STD_CALL *ASICFamilyType)(/* IADLXGPU* pThis, ADLX_ASIC_FAMILY_TYPE* asicFamilyType */);
+ ADLX_RESULT (ADLX_STD_CALL *Type)(/* IADLXGPU* pThis, ADLX_GPU_TYPE* gpuType */);
+ ADLX_RESULT (ADLX_STD_CALL *IsExternal)(/* IADLXGPU* pThis, adlx_bool* isExternal */);
+ ADLX_RESULT (ADLX_STD_CALL *Name)(/* IADLXGPU* pThis, const char** gpuName */);
+ ADLX_RESULT (ADLX_STD_CALL *DriverPath)(/* IADLXGPU* pThis, const char** driverPath */);
+ ADLX_RESULT (ADLX_STD_CALL *PNPString)(/* IADLXGPU* pThis, const char** pnpString */);
+ ADLX_RESULT (ADLX_STD_CALL *HasDesktops)(/* IADLXGPU* pThis, adlx_bool* hasDesktops */);
+ ADLX_RESULT (ADLX_STD_CALL *TotalVRAM)(IADLXGPU* pThis, adlx_uint* vramMB); // Used
+ ADLX_RESULT (ADLX_STD_CALL *VRAMType)(/* IADLXGPU* pThis, const char** type */);
+ ADLX_RESULT (ADLX_STD_CALL *BIOSInfo)(/* IADLXGPU* pThis, const char** partNumber, const char** version, const char** date */);
+ ADLX_RESULT (ADLX_STD_CALL *DeviceId)(/* IADLXGPU* pThis, const char** deviceId */);
+ ADLX_RESULT (ADLX_STD_CALL *RevisionId)(/* IADLXGPU* pThis, const char** revisionId */);
+ ADLX_RESULT (ADLX_STD_CALL *SubSystemId)(/* IADLXGPU* pThis, const char** subSystemId */);
+ ADLX_RESULT (ADLX_STD_CALL *SubSystemVendorId)(/* IADLXGPU* pThis, const char** subSystemVendorId */);
+ ADLX_RESULT (ADLX_STD_CALL *UniqueId)(IADLXGPU* pThis, adlx_int* uniqueId); // Used
+} IADLXGPUVtbl;
+struct IADLXGPU { const IADLXGPUVtbl *pVtbl; };
+
+typedef struct IADLXGPUListVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPUList* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXGPUList* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPUList* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXList
+ adlx_uint (ADLX_STD_CALL *Size)(/* IADLXGPUList* pThis */);
+ adlx_uint8 (ADLX_STD_CALL *Empty)(/* IADLXGPUList* pThis */);
+ adlx_uint (ADLX_STD_CALL *Begin)(IADLXGPUList* pThis); // Used
+ adlx_uint (ADLX_STD_CALL *End)(IADLXGPUList* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *At)(/* IADLXGPUList* pThis, const adlx_uint location, IADLXInterface** ppItem */);
+ ADLX_RESULT (ADLX_STD_CALL *Clear)(/* IADLXGPUList* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *Remove_Back)(/* IADLXGPUList* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *Add_Back)(/* IADLXGPUList* pThis, IADLXInterface* pItem */);
+
+ //IADLXGPUList
+ ADLX_RESULT (ADLX_STD_CALL *At_GPUList)(IADLXGPUList* pThis, const adlx_uint location, IADLXGPU** ppItem); // Used
+ ADLX_RESULT (ADLX_STD_CALL *Add_Back_GPUList)(/* IADLXGPUList* pThis, IADLXGPU* pItem */);
+
+} IADLXGPUListVtbl;
+struct IADLXGPUList { const IADLXGPUListVtbl *pVtbl; };
+
+typedef struct IADLXPerformanceMonitoringServicesVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXPerformanceMonitoringServices* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXPerformanceMonitoringServices* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXPerformanceMonitoringServices* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXPerformanceMonitoringServices
+ ADLX_RESULT (ADLX_STD_CALL *GetSamplingIntervalRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+ ADLX_RESULT (ADLX_STD_CALL *SetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int intervalMs */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* intervalMs */);
+ ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySizeRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+ ADLX_RESULT (ADLX_STD_CALL *SetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *ClearPerformanceMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *StartPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *StopPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *GetAllMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXAllMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, adlx_int startMs, adlx_int stopMs, IADLXGPUMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSystemMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXSystemMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetFPSHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXFPSList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentAllMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXAllMetrics** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetrics** ppMetrics); // Used
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetrics** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentFPS)(/* IADLXPerformanceMonitoringServices* pThis, IADLXFPS** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSupportedGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetricsSupport** ppMetricsSupported); // Used
+ ADLX_RESULT (ADLX_STD_CALL *GetSupportedSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetricsSupport** ppMetricsSupported */);
+}IADLXPerformanceMonitoringServicesVtbl;
+struct IADLXPerformanceMonitoringServices { const IADLXPerformanceMonitoringServicesVtbl *pVtbl; };
+
+typedef struct IADLXGPUMetricsSupportVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetricsSupport* pThis */);
+ adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetricsSupport* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetricsSupport* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPUMetricsSupport
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUUsage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAMClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUHotspotTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTotalBoardPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUFanSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAM)(IADLXGPUMetricsSupport* pThis, adlx_bool* supported); // Used
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVoltage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUUsageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUHotspotTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUFanSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVoltageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUTotalBoardPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+} IADLXGPUMetricsSupportVtbl;
+struct IADLXGPUMetricsSupport { const IADLXGPUMetricsSupportVtbl *pVtbl; };
+
+typedef struct IADLXGPUMetricsVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetrics* pThis */);
+ adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetrics* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetrics* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPUMetrics
+ ADLX_RESULT (ADLX_STD_CALL* TimeStamp)(/* IADLXGPUMetrics* pThis, adlx_int64* ms */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUUsage)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUVRAMClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUHotspotTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUTotalBoardPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUFanSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUVRAM)(IADLXGPUMetrics* pThis, adlx_int* data); // Used
+ ADLX_RESULT (ADLX_STD_CALL* GPUVoltage)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+} IADLXGPUMetricsVtbl;
+struct IADLXGPUMetrics { const IADLXGPUMetricsVtbl *pVtbl; };
+
+struct {
+ void *handle;
+ ADLX_RESULT (*ADLXInitialize)(adlx_uint64 version, IADLXSystem** ppSystem);
+ ADLX_RESULT (*ADLXInitializeWithIncompatibleDriver)(adlx_uint64 version, IADLXSystem** ppSystem);
+ ADLX_RESULT (*ADLXQueryVersion)(const char** version);
+ ADLX_RESULT (*ADLXTerminate)();
+ IADLXSystem *sys;
+} adlx { NULL, NULL, NULL, NULL, NULL, NULL };
+static std::mutex ggml_adlx_lock;
+
+extern "C" {
+
+int ggml_hip_mgmt_init() {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle != NULL) {
+ // Already initialized
+ return 0;
+ }
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath = fs::path("\\Windows") / fs::path("System32") / fs::path("amdadlx64.dll");
+
+ adlx.handle = (void*)LoadLibraryW(libPath.wstring().c_str());
+ if (adlx.handle == NULL) {
+ return ADLX_NOT_FOUND;
+ }
+
+ adlx.ADLXInitialize = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitialize");
+ adlx.ADLXInitializeWithIncompatibleDriver = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitializeWithIncompatibleDriver");
+ adlx.ADLXTerminate = (ADLX_RESULT (*)()) GetProcAddress((HMODULE)(adlx.handle), "ADLXTerminate");
+ adlx.ADLXQueryVersion = (ADLX_RESULT (*)(const char **version)) GetProcAddress((HMODULE)(adlx.handle), "ADLXQueryVersion");
+ if (adlx.ADLXInitialize == NULL || adlx.ADLXInitializeWithIncompatibleDriver == NULL || adlx.ADLXTerminate == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in amdadlx64.dll, falling back to hip free memory reporting", __func__);
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+ return ADLX_NOT_FOUND;
+ }
+
+ SetErrorMode(old_mode);
+
+ // Aid in troubleshooting...
+ if (adlx.ADLXQueryVersion != NULL) {
+ const char *version = NULL;
+ ADLX_RESULT status = adlx.ADLXQueryVersion(&version);
+ if (ADLX_SUCCEEDED(status)) {
+ GGML_LOG_DEBUG("%s located ADLX version %s\n", __func__, version);
+ }
+ }
+
+ ADLX_RESULT status = adlx.ADLXInitialize(ADLX_FULL_VERSION, &adlx.sys);
+ if (ADLX_FAILED(status)) {
+ // GGML_LOG_DEBUG("%s failed to initialize ADLX error=%d - attempting with incompatible driver...\n", __func__, status);
+ // Try with the incompatible driver
+ status = adlx.ADLXInitializeWithIncompatibleDriver(ADLX_FULL_VERSION, &adlx.sys);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s failed to initialize ADLX error=%d\n", __func__, status);
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+ adlx.sys = NULL;
+ return status;
+ }
+    // GGML_LOG_DEBUG("%s initialized ADLX with incompatible driver\n", __func__);
+ }
+ return ADLX_OK;
+}
+
+void ggml_hip_mgmt_release() {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle == NULL) {
+ // Already free
+ return;
+ }
+ ADLX_RESULT status = adlx.ADLXTerminate();
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s failed to terminate Adlx %d\n", __func__, status);
+ // Unload anyway...
+ }
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+}
+
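+// Helper macro: release whichever ADLX interfaces were acquired in ggml_hip_get_device_memory before returning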
+#define adlx_gdm_cleanup \
+ if (gpuMetricsSupport != NULL) gpuMetricsSupport->pVtbl->Release(gpuMetricsSupport); \
+ if (gpuMetrics != NULL) gpuMetrics->pVtbl->Release(gpuMetrics); \
+ if (perfMonitoringServices != NULL) perfMonitoringServices->pVtbl->Release(perfMonitoringServices); \
+ if (gpus != NULL) gpus->pVtbl->Release(gpus); \
+ if (gpu != NULL) gpu->pVtbl->Release(gpu)
+
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle == NULL) {
+ GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
+ return ADLX_ADL_INIT_ERROR;
+ }
+ IADLXGPUMetricsSupport *gpuMetricsSupport = NULL;
+ IADLXPerformanceMonitoringServices *perfMonitoringServices = NULL;
+ IADLXGPUList* gpus = NULL;
+ IADLXGPU* gpu = NULL;
+ IADLXGPUMetrics *gpuMetrics = NULL;
+ ADLX_RESULT status;
+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
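+    // e.g. a GPU at PCI bus 0x03, device 0x00 maps to UniqueId 0x0300 (illustrative)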
+
+ status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
+ return status;
+ }
+
+ status = adlx.sys->pVtbl->GetGPUs(adlx.sys, &gpus);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetGPUs failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ // Get GPU list
+ for (adlx_uint crt = gpus->pVtbl->Begin(gpus); crt != gpus->pVtbl->End(gpus); ++crt)
+ {
+ status = gpus->pVtbl->At_GPUList(gpus, crt, &gpu);
+ if (ADLX_FAILED(status))
+ {
+ GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
+ continue;
+ }
+ adlx_int id;
+ status = gpu->pVtbl->UniqueId(gpu, &id);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
+ }
+ if (id != target) {
+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
+ }
+ // Any failures at this point should cause a fall-back to other APIs
+ status = perfMonitoringServices->pVtbl->GetSupportedGPUMetrics(perfMonitoringServices, gpu, &gpuMetricsSupport);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetSupportedGPUMetrics failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+ status = perfMonitoringServices->pVtbl->GetCurrentGPUMetrics(perfMonitoringServices, gpu, &gpuMetrics);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetCurrentGPUMetrics failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_bool supported = false;
+ status = gpuMetricsSupport->pVtbl->IsSupportedGPUVRAM(gpuMetricsSupport, &supported);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s IsSupportedGPUVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_uint totalVRAM = 0;
+ status = gpu->pVtbl->TotalVRAM(gpu, &totalVRAM);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s TotalVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_int usedVRAM = 0;
+ status = gpuMetrics->pVtbl->GPUVRAM(gpuMetrics, &usedVRAM);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GPUVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
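+    // ADLX reports VRAM in MB; convert to bytes for the caller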
+ *total = size_t(totalVRAM) * 1024 * 1024;
+ *free = size_t(totalVRAM-usedVRAM) * 1024 * 1024;
+
+ adlx_gdm_cleanup;
+ return ADLX_OK;
+ }
+ adlx_gdm_cleanup;
+ return ADLX_NOT_FOUND;
+}
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+// TODO Linux implementation of accurate VRAM reporting
+int ggml_hip_mgmt_init() {
+ return -1;
+}
+void ggml_hip_mgmt_release() {}
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+ return -1;
+}
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@
+// NVIDIA Management Library (NVML)
+//
+// https://developer.nvidia.com/management-library-nvml
+//
+// This library provides accurate VRAM reporting for NVIDIA GPUs, particularly
+// on Windows, where the cuda library provides inaccurate VRAM usage metrics. The
+// runtime DLL is installed with every driver on Windows, and most Linux
+// systems, and the headers are included in the standard CUDA SDK install. As
+// such, we can include the header here to simplify the code.
+
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+
+namespace fs = std::filesystem;
+
+// Minimal definitions to avoid including the nvml.h header
+typedef enum nvmlReturn_enum
+{
+ // cppcheck-suppress *
+ NVML_SUCCESS = 0, //!< The operation was successful
+ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
+ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
+ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
+ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
+ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
+ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
+ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
+ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
+ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
+ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
+ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
+ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
+ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
+ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
+ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
+ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
+ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
+ NVML_ERROR_MEMORY = 20, //!< Insufficient memory
+ NVML_ERROR_NO_DATA = 21, //!< No data
+ NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled
+ NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory
+ NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory
+ NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported
+ NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated
+ NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request
+ NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found
+ NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation
+ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
+} nvmlReturn_t;
+typedef struct nvmlDevice_st* nvmlDevice_t;
+typedef struct nvmlMemory_st
+{
+ unsigned long long total; //!< Total physical device memory (in bytes)
+ unsigned long long free; //!< Unallocated device memory (in bytes)
+ unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes).
+ //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+// end nvml.h definitions
+
+struct {
+ void *handle;
+ nvmlReturn_t (*nvmlInit_v2)(void);
+ nvmlReturn_t (*nvmlShutdown)(void);
+ nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
+ nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml { NULL, NULL, NULL, NULL, NULL };
+static std::mutex ggml_nvml_lock;
+
+extern "C" {
+
+int ggml_nvml_init() {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle != NULL) {
+ // Already initialized
+ return 0;
+ }
+#ifdef _WIN32
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath[2];
+ const char * programDir = std::getenv("ProgramW6432");
+ if (programDir == NULL) {
+ libPath[0] = fs::path("Program Files") / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+ } else {
+ libPath[0] = fs::path(programDir) / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+ }
+ libPath[1] = fs::path("\\Windows") / fs::path("System32") / fs::path("NVML.dll");
+
+ for (int i = 0; i < 2; i++) {
+ nvml.handle = (void*)LoadLibraryW(libPath[i].wstring().c_str());
+ if (nvml.handle != NULL) {
+ break;
+ }
+ }
+ if (nvml.handle == NULL) {
+ return NVML_ERROR_NOT_FOUND;
+ }
+
+ nvml.nvmlInit_v2 = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlInit_v2");
+ nvml.nvmlShutdown = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlShutdown");
+ nvml.nvmlDeviceGetHandleByUUID = (nvmlReturn_t (*)(const char *, nvmlDevice_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetHandleByUUID");
+ nvml.nvmlDeviceGetMemoryInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlMemory_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetMemoryInfo");
+ if (nvml.nvmlInit_v2 == NULL || nvml.nvmlShutdown == NULL || nvml.nvmlDeviceGetHandleByUUID == NULL || nvml.nvmlDeviceGetMemoryInfo == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in NVML.dll", __func__);
+ FreeLibrary((HMODULE)(nvml.handle));
+ nvml.handle = NULL;
+ return NVML_ERROR_NOT_FOUND;
+ }
+
+ SetErrorMode(old_mode);
+
+#else
+ // Not currently wired up on Linux
+ return NVML_ERROR_NOT_SUPPORTED;
+#endif
+    return nvml.nvmlInit_v2();
+}
+
+void ggml_nvml_release() {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle == NULL) {
+ // Already free
+ return;
+ }
+ nvmlReturn_enum status = nvml.nvmlShutdown();
+ if (status != NVML_SUCCESS) {
+ GGML_LOG_INFO("%s failed to shutdown NVML: %d\n", __func__, status);
+ }
+#ifdef _WIN32
+ FreeLibrary((HMODULE)(nvml.handle));
+ nvml.handle = NULL;
+#else
+ // Not currently wired up on Linux
+#endif
+}
+
+int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total) {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle == NULL) {
+ return NVML_ERROR_UNINITIALIZED;
+ }
+ nvmlDevice_t device;
+ auto status = nvml.nvmlDeviceGetHandleByUUID(uuid, &device);
+ if (status != NVML_SUCCESS) {
+ return status;
+ }
+ nvmlMemory_t memInfo = {0};
+ status = nvml.nvmlDeviceGetMemoryInfo(device, &memInfo);
+ if (status == NVML_SUCCESS) {
+ *free = memInfo.free;
+ *total = memInfo.total;
+ }
+ return status;
+}
+
+}
\ No newline at end of file
@@ -196,7 +196,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
    }
    useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
-        discover.GetGPUInfo().FlashAttentionSupported() &&
+        (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
        f.SupportsFlashAttention()
    var kvct string
@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
    }
    // on metal there's no partial offload overhead
-    if gpus[0].Library == "metal" {
+    if gpus[0].Library == "Metal" {
        graphPartialOffload = graphFullOffload
    } else if len(gpus) > 1 {
        // multigpu should always use the partial graph size
...
@@ -12,6 +12,7 @@ import (
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/fs/ggml"
+    "github.com/ollama/ollama/ml"
)
func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
    // Simple CPU scenario
    gpus := []discover.GpuInfo{
        {
-            Library: "cpu",
+            DeviceID: ml.DeviceID{
+                Library: "cpu",
+            },
        },
    }
    projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
    gpuMinimumMemory := uint64(2048)
    gpus = []discover.GpuInfo{
        {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
            MinimumMemory: gpuMinimumMemory,
        },
        {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
            MinimumMemory: gpuMinimumMemory,
        },
    }
...
@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
    ModelPath() string
-    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
+    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
    Ping(ctx context.Context) error
    WaitUntilRunning(ctx context.Context) error
    Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,11 @@ type LlamaServer interface {
    Close() error
    VRAMSize() uint64 // Total VRAM across all GPUs
    TotalSize() uint64
-    VRAMByGPU(gpuID string) uint64
+    VRAMByGPU(id ml.DeviceID) uint64
    Pid() int
+    GetPort() int
+    GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+    HasExited() bool
}
// llmServer is an instance of a runner hosting a single model
@@ -331,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
        if gpu.DependencyPath != nil {
            slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
            libraryPaths = append(gpu.DependencyPath, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
        }
    }
@@ -361,12 +365,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
    s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
-    envWorkarounds := []string{}
-    for _, gpu := range gpus {
-        envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-    }
    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
-    envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
+    envWorkarounds := gpus.GetVisibleDevicesEnv()
    pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
    // Update or add the path variable with our adjusted version
@@ -496,7 +496,7 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
    systemInfo := discover.GetSystemInfo()
    systemTotalMemory := systemInfo.System.TotalMemory
    systemFreeMemory := systemInfo.System.FreeMemory
@@ -509,7 +509,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
            g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
        } else {
            slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
-            return ErrLoadRequiredFull
+            return nil, ErrLoadRequiredFull
        }
    }
@@ -518,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    if len(gpus) > 1 || gpus[0].Library != "cpu" {
        switch {
-        case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+        case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
            // disable partial offloading when model is greater than total system memory as this
            // can lead to locking up the system
            s.options.NumGPU = 0
-        case gpus[0].Library != "metal" && s.estimate.Layers == 0:
+        case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
            // Don't bother loading into the GPU if no layers can fit
-            gpus = discover.GetCPUInfo()
+            gpus = discover.GpuInfoList{discover.GetCPUInfo()}
        case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
            s.options.NumGPU = s.estimate.Layers
        }
@@ -537,7 +537,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
        available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
        if systemMemoryRequired > available {
            slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-            return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+            return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
        }
    }
@@ -552,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    // mmap has issues with partial offloading on metal
    for _, g := range gpus {
-        if g.Library == "metal" &&
+        if g.Library == "Metal" &&
            uint64(s.options.NumGPU) > 0 &&
            uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
            s.options.UseMMap = new(bool)
@@ -563,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    // Windows CUDA should not use mmap for best performance
    // Linux with a model larger than free space, mmap leads to thrashing
    // For CPU loads we want the memory to be allocated, not FS cache
-    if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
+    if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
        (runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
        (gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
        (s.options.UseMMap != nil && !*s.options.UseMMap) {
@@ -572,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    }
    if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
    }
    resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
    if err != nil {
-        return err
+        return nil, err
    }
    // On the Ollama engine, we can print out a summary of the memory allocations.
@@ -588,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    if !resp.Success {
        slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
-        return errors.New("failed to allocate memory for model")
+        return nil, errors.New("failed to allocate memory for model")
    }
    // The llama engine does its memory allocations together with model loading, so we
    // need to wait until it is done to ensure that we have accurate memory data before
    // loading the next model
    if s.textProcessor == nil {
-        return s.WaitUntilRunning(ctx)
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
    } else {
-        return nil
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
    }
}
@@ -610,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
    gpuLayers := make(ml.GPULayersList, len(gpus))
    for i := range gpuLayers {
-        gpuLayers[i].ID = gpus[i].ID
+        gpuLayers[i].DeviceID = gpus[i].DeviceID
    }
    var sum float32
@@ -658,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
//
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
// allowing for faster iteration, but may return less information.
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+//
+// Returns the list of GPU IDs that were used in the final allocation on success
+func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
    var success bool
    defer func() {
        if !success {
@@ -683,7 +685,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
        if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
            available = 0
        }
-        slog.Info("gpu memory", "id", gpu.ID,
+        slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
            "available", format.HumanBytes2(available),
            "free", format.HumanBytes2(gpu.FreeMemory),
            "minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -696,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
    gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
    if err != nil {
-        return err
+        return nil, err
    }
    if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
    }
nextOperation:
@@ -710,7 +712,7 @@ nextOperation:
        s.loadRequest.GPULayers = gpuLayers
        resp, err := s.initModel(ctx, s.loadRequest, operation)
        if err != nil {
-            return err
+            return nil, err
        }
        resp.Memory.Log(slog.LevelDebug)
@@ -722,7 +724,7 @@ nextOperation:
        for {
            newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
            if err != nil {
-                return err
+                return nil, err
            }
            slog.Debug("new layout created", "layers", newGPULayers)
@@ -756,7 +758,7 @@ nextOperation:
                newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
                s.options.NumGPU = -1
                if err != nil {
-                    return err
+                    return nil, err
                }
                slog.Debug("new layout created", "layers", newGPULayers)
@@ -764,7 +766,7 @@ nextOperation:
                s.loadRequest.GPULayers = newGPULayers
                resp, err = s.initModel(ctx, s.loadRequest, operation)
                if err != nil {
-                    return err
+                    return nil, err
                }
                resp.Memory.Log(slog.LevelDebug)
@@ -773,7 +775,7 @@ nextOperation:
                if resp.Success {
                    verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
                    if err != nil {
-                        return err
+                        return nil, err
                    }
                    slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -798,7 +800,7 @@ nextOperation:
                }
                if s.options.NumGPU >= 0 {
-                    return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
+                    return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
                }
                // Memory allocation failed even though we created a layout that we thought should
@@ -808,7 +810,7 @@ nextOperation:
                // space.
                if backoff > 1 {
                    slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
-                    return errors.New("memory layout cannot be allocated")
+                    return nil, errors.New("memory layout cannot be allocated")
                } else if backoff == 0 {
                    backoff = 0.01
                } else {
@@ -823,7 +825,7 @@ nextOperation:
    s.loadRequest.GPULayers = gpuLayers
    resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
    if err != nil {
-        return err
+        return nil, err
    }
    success = resp.Success
@@ -831,10 +833,27 @@ nextOperation:
    if !success {
        slog.Warn("failed to commit memory for model", "memory", resp.Memory)
-        return errors.New("failed to commit memory for model")
+        return nil, errors.New("failed to commit memory for model")
    }
-    return nil
+    return uniqueDeviceIDs(gpuLayers), nil
+}
+
+func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
+    devices := []ml.DeviceID{}
+    for _, layer := range gpuLayers {
+        new := true
+        for _, ID := range devices {
+            if layer.DeviceID == ID {
+                new = false
+                break
+            }
+        }
+        if new {
+            devices = append(devices, layer.DeviceID)
+        }
+    }
+    return devices
}
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
...@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
for i := range gl { for i := range gl {
found := false found := false
for j := range memory.GPUs { for j := range memory.GPUs {
if gl[i].ID == memory.GPUs[j].ID { if gl[i].DeviceID == memory.GPUs[j].DeviceID {
if memory.GPUs[j].Graph != 0 { if memory.GPUs[j].Graph != 0 {
lastUsedGPU = i lastUsedGPU = i
} }
...@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
gl[i].FreeMemory = 0 gl[i].FreeMemory = 0
} }
slog.Debug("available gpu", "id", gl[i].ID, slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory), "available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory), "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()), "overhead", format.HumanBytes2(envconfig.GpuOverhead()),
...@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
var vramSize uint64 var vramSize uint64
for _, gl := range gpuLayers { for _, gl := range gpuLayers {
for _, gpu := range memory.GPUs { for _, gpu := range memory.GPUs {
if gl.ID == gpu.ID { if gl.DeviceID == gpu.DeviceID {
vramSize += gpu.Graph vramSize += gpu.Graph
break break
} }
...@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int ...@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) { func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1 device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}} gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity) freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
for i := len(layers) - 1; i >= 0; i-- { for i := len(layers) - 1; i >= 0; i-- {
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers { if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
...@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req ...@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
if device < 0 { if device < 0 {
return gpuLayers return gpuLayers
} }
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...) gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity) freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
} }
} }
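A simplified, standalone illustration of the spill-over idea (not the real greedyFit, which also honors requestedLayers and works over discover.GpuInfoList): walk layers from the end of the model, fill the last GPU first, and move to the previous GPU once a layer no longer fits in its capacity-scaled free space. The sizes below are made up for the example.

```go
// Simplified sketch of greedy spill-over assignment; the numbers and the
// assignment type here are illustrative, not taken from the tests.
package main

import "fmt"

type assignment struct {
	gpu    int
	layers []int
}

func main() {
	free := []uint64{100, 300}         // MiB free on gpu0, gpu1
	layers := []uint64{80, 80, 80, 80} // per-layer sizes in MiB
	capacity := float32(1.0)           // findBestFit backs this off when a fit fails

	device := len(free) - 1
	space := uint64(float32(free[device]) * capacity)
	result := []assignment{{gpu: device}}

	for i := len(layers) - 1; i >= 0; i-- {
		for layers[i] > space {
			device--
			if device < 0 {
				fmt.Println(result)
				return
			}
			space = uint64(float32(free[device]) * capacity)
			result = append([]assignment{{gpu: device}}, result...)
		}
		result[0].layers = append(result[0].layers, i)
		space -= layers[i]
	}
	fmt.Println(result) // [{0 [0]} {1 [3 2 1]}]
}
```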
@@ -1312,6 +1331,17 @@ func (s *llmServer) Pid() int {
return -1
}
func (s *llmServer) GetPort() int {
return s.port
}
func (s *llmServer) HasExited() bool {
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
return true
}
return false
}
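A minimal sketch of why this exit check is cheap and non-blocking (hypothetical caller; assumes a Unix-like system with a sleep binary): os/exec only populates ProcessState after Wait has observed the child exiting, so a nil check plus ExitCode() >= 0 answers "has the runner gone away?" without blocking.

```go
// Hypothetical illustration of the HasExited pattern, not code from this change.
package main

import (
	"fmt"
	"os/exec"
	"time"
)

func hasExited(cmd *exec.Cmd) bool {
	// ProcessState is nil until Wait has reaped the child; ExitCode() is -1
	// while the process is still running or was terminated by a signal.
	return cmd != nil && cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0
}

func main() {
	cmd := exec.Command("sleep", "1")
	_ = cmd.Start()
	go cmd.Wait() // Wait populates ProcessState once the child exits

	for !hasExited(cmd) {
		time.Sleep(200 * time.Millisecond)
	}
	fmt.Println("runner exited with", cmd.ProcessState.ExitCode())
}
```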
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1386,7 +1416,7 @@ type CompletionResponse struct {
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
logutil.Trace("completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
@@ -1552,7 +1582,7 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -1704,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
@@ -1715,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
@@ -1757,16 +1792,28 @@ func (s *ollamaServer) TotalSize() uint64 {
return mem
}
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}
for _, g := range s.mem.GPUs {
if g.DeviceID == id {
return g.Size()
}
}
return 0
}
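As a hedged illustration of how these per-device totals compose (the helper below is hypothetical and not part of this change; it assumes ml.DeviceMemory carries a DeviceID, as the comparison above implies):

```go
// Hypothetical helper: total the VRAM attributed to a set of devices using
// the same per-device Size() accounting that VRAMByGPU uses for one GPU.
package schedutil // illustrative package name

import "github.com/ollama/ollama/ml"

func totalVRAM(mem *ml.BackendMemory, ids []ml.DeviceID) uint64 {
	if mem == nil {
		return 0
	}
	var total uint64
	for _, id := range ids {
		for _, g := range mem.GPUs {
			if g.DeviceID == id {
				total += g.Size()
				break
			}
		}
	}
	return total
}
```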
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log
slog.Debug("failure refreshing GPU information", "error", err)
}
// else no longer running so suppress logging as a failure is expected
}
return devices
}
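A hedged sketch of a caller that periodically refreshes device information through this method; the DeviceInfoProvider interface, the package name, and the polling cadence are assumptions for illustration, not part of this change.

```go
// Hypothetical consumer: poll a running server for fresh device info and log
// the free VRAM it reports.
package schedutil // illustrative package name

import (
	"context"
	"log/slog"
	"time"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/ml"
)

// DeviceInfoProvider is an assumed interface matching the method above.
type DeviceInfoProvider interface {
	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}

func watchVRAM(ctx context.Context, s DeviceInfoProvider) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			for _, d := range s.GetDeviceInfos(ctx) {
				slog.Debug("device", "library", d.Library, "id", d.ID,
					"free", format.HumanBytes2(d.FreeMemory))
			}
		}
	}
}
```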
@@ -16,8 +16,8 @@ import (
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
id ml.DeviceID
free int
}
tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "Full single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].DeviceID = tt.gpus[i].id
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
@@ -164,7 +163,7 @@ func TestLLMServerFitGPU(t *testing.T) {
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
...
@@ -5,14 +5,11 @@ import (
"context"
"encoding/binary"
"fmt"
"hash/maphash"
"log/slog"
"math" "math"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
) )
...@@ -29,6 +26,9 @@ type Backend interface { ...@@ -29,6 +26,9 @@ type Backend interface {
Get(name string) Tensor Get(name string) Tensor
NewContext() Context NewContext() Context
NewContextSize(size int) Context NewContextSize(size int) Context
// Enumerate the devices available for inference via this backend
BackendDevices() []DeviceInfo
}
// BackendCacheConfig should be implemented by backends that need special output
@@ -60,77 +60,6 @@ type CacheConfig struct {
MaskBatchPadding int
}
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indices to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
@@ -148,150 +77,6 @@ type BackendParams struct {
FlashAttention bool
}
// ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// ID is an identifier for the device for matching with system
// management libraries.
ID string
// Weights is the per-layer memory needed for the model weights.
Weights []uint64
// Cache is the per-layer memory needed for the KV cache.
Cache []uint64
// Graph is the size of the compute graph. It is not per-layer.
Graph uint64
}
func sumMemory(mem []uint64) uint64 {
var sum uint64
for _, m := range mem {
sum += m
}
return sum
}
// Size returns the total size of the memory required by this device
func (m DeviceMemory) Size() uint64 {
return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
}
func memoryPresent(mem []uint64) bool {
return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.ID != "" {
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights uint64
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
// Log prints a high level summary of the memory
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
...
package ggml
// #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
// #cgo windows LDFLAGS: -lpthread
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
@@ -168,6 +170,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.ID = C.GoString(props.id)
requiredMemory.CPU.Library = C.GoString(props.library)
requiredMemory.CPU.Weights = make([]uint64, blocks+1)
requiredMemory.CPU.Cache = make([]uint64, blocks+1)
@@ -186,6 +189,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].ID = C.GoString(props.id)
requiredMemory.GPUs[i].Library = C.GoString(props.library)
requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
}
@@ -198,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
for _, l := range p.Layers {
if l == layer {
for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].DeviceID == p.DeviceID {
return gpuDeviceBufferTypes[i]
}
}
@@ -682,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
}
}
func (b *Backend) BackendDevices() []ml.DeviceInfo {
deviceInfos := []ml.DeviceInfo{}
for _, dev := range gpus {
// If we have a model loaded, and it's only loaded on a subset of the devices,
// skip idle/unused devices to avoid initializing them and causing VRAM allocations
if b.allocMemory {
idleDev := true
for _, backend := range b.schedBackends {
if dev == C.ggml_backend_get_device(backend) {
idleDev = false
break
}
}
if idleDev {
slog.Debug("skipping unused backend device", "description", C.GoString(C.ggml_backend_dev_description(dev)))
continue
}
}
info := ml.DeviceInfo{}
props := C.struct_ggml_backend_dev_props{}
C.ggml_backend_dev_get_props(dev, &props)
info.Name = C.GoString(props.name)
info.Description = C.GoString(props.description)
info.ID = C.GoString(props.id)
info.Library = C.GoString(props.library)
info.ComputeMajor = (int)(props.compute_major)
info.ComputeMinor = (int)(props.compute_minor)
info.DriverMajor = (int)(props.driver_major)
info.DriverMinor = (int)(props.driver_minor)
info.Integrated = props.integrated != 0
if props.library != nil {
info.Library = C.GoString(props.library)
}
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
info.LibraryPath = ggml.LibPaths()
C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
info.TotalMemory = (uint64)(props.memory_total)
info.FreeMemory = (uint64)(props.memory_free)
deviceInfos = append(deviceInfos, info)
}
return deviceInfos
}
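A hedged sketch of how a caller might narrow the devices reported here, for example by library and compute capability; the field names match the struct populated above, while the package name and the 7.5 cutoff are purely illustrative.

```go
// Hypothetical filter over the DeviceInfo slice returned by BackendDevices.
package discoverutil // illustrative package name

import "github.com/ollama/ollama/ml"

// supportedCUDA keeps CUDA devices at or above a minimum compute capability.
// The 7.5 threshold is an example value, not a policy from this change.
func supportedCUDA(devs []ml.DeviceInfo) []ml.DeviceInfo {
	out := make([]ml.DeviceInfo, 0, len(devs))
	for _, d := range devs {
		if d.Library != "CUDA" {
			continue
		}
		if d.ComputeMajor > 7 || (d.ComputeMajor == 7 && d.ComputeMinor >= 5) {
			out = append(out, d)
		}
	}
	return out
}
```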
type Context struct {
b *Backend
...
@@ -157,6 +157,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
int driver_major;
int driver_minor;
int compute_major;
int compute_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
const char *library;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
......
@@ -203,6 +203,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
...
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if defined(GGML_USE_HIP)
if (std::getenv("GGML_CUDA_INIT") != NULL) {
GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
CUDA_CHECK(cudaSetDevice(id));
// rocblas_initialize will SIGABRT if the GPU isn't supported
rocblas_initialize();
GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
}
#endif
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
#ifdef __CUDA_ARCH_LIST__
if (std::getenv("GGML_CUDA_INIT") != NULL) {
GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
}
#endif // defined(__CUDA_ARCH_LIST__)
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str()); ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP) #endif // defined(GGML_USE_HIP)
} }
...@@ -3352,6 +3368,14 @@ struct ggml_backend_cuda_device_context { ...@@ -3352,6 +3368,14 @@ struct ggml_backend_cuda_device_context {
std::string name; std::string name;
std::string description; std::string description;
std::string id; std::string id;
int major;
int minor;
int driver_major;
int driver_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3372,6 +3396,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
#if defined(GGML_USE_HIP)
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
ggml_hip_mgmt_release();
return;
}
ggml_hip_mgmt_release();
}
#else
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
ggml_nvml_release();
return;
}
ggml_nvml_release();
}
#endif
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3380,6 +3426,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3390,6 +3437,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
#if defined(GGML_USE_HIP)
int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
props->compute_major = cc / 0x100;
props->compute_minor = cc - (props->compute_major * 0x100);
#else
props->compute_major = ctx->major;
props->compute_minor = ctx->minor;
#endif
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->integrated;
props->pci_bus_id = ctx->pci_bus_id;
props->pci_device_id = ctx->pci_device_id;
props->pci_domain_id = ctx->pci_domain_id;
props->library = GGML_CUDA_NAME;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3980,6 +4044,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
int driverVersion = 0;
CUDA_CHECK(cudaDriverGetVersion(&driverVersion));
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3990,7 +4056,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
dev_ctx->major = prop.major;
dev_ctx->minor = prop.minor;
dev_ctx->driver_major = driverVersion / 1000;
dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
dev_ctx->integrated = prop.integrated;
dev_ctx->pci_bus_id = prop.pciBusID;
dev_ctx->pci_device_id = prop.pciDeviceID;
dev_ctx->pci_domain_id = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
...