Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama runner. This should eliminate inconsistency between our GPU discovery and the runner's capabilities at runtime, particularly for cases where we try to filter out unsupported GPUs. Now the runner does that implicitly based on the actual device list. In some cases free VRAM reporting can be unreliable, which can lead to scheduling mistakes, so this also includes a patch to leverage more reliable VRAM reporting libraries if available. Automatic workarounds have been removed as only one GPU leveraged this, which is now documented. This GPU will soon fall off the support matrix with the next ROCm bump. Additional cleanup of the scheduler and discovery packages can be done in the future once we have switched on the new memory management code, and removed support for the llama runner.

Use runners for GPU discovery (#12090)
This revamps how we discover GPUs in the system by leveraging the Ollama runner. This should eliminate inconsistency between our GPU discovery and the runner's capabilities at runtime, particularly for cases where we try to filter out unsupported GPUs. Now the runner does that implicitly based on the actual device list. In some cases free VRAM reporting can be unreliable, which can lead to scheduling mistakes, so this also includes a patch to leverage more reliable VRAM reporting libraries if available. Automatic workarounds have been removed as only one GPU leveraged this, which is now documented. This GPU will soon fall off the support matrix with the next ROCm bump. Additional cleanup of the scheduler and discovery packages can be done in the future once we have switched on the new memory management code, and removed support for the llama runner.
bc8909fb · Daniel Hiltgen · GitHub · 6b50f2b9 · 6b50f2b9 · 6b50f2b9
Unverified Commit bc8909fb authored Oct 01, 2025 by Daniel Hiltgen Committed by GitHub Oct 01, 2025
20 changed files
--- a/discover/gpu_info_nvml.h
+++ b/discover/gpu_info_nvml.h
-#ifndef __APPLE__
-#ifndef __GPU_INFO_NVML_H__
-#define __GPU_INFO_NVML_H__
-#include "gpu_info.h"
-// Hand-written subset of the NVML API: just enough typedefs to dlopen/dlsym
-// the library and query memory information, without the NVIDIA headers.
-typedef enum nvmlReturn_enum {
-  NVML_SUCCESS = 0,
-  // Other values omitted for now...
-} nvmlReturn_t;
-typedef void *nvmlDevice_t;  // Opaque is sufficient
-// Memory counters filled in by nvmlDeviceGetMemoryInfo.
-// NOTE(review): presumably byte counts, as in NVML itself - confirm.
-typedef struct nvmlMemory_st {
-  unsigned long long total;
-  unsigned long long free;
-  unsigned long long used;
-} nvmlMemory_t;
-typedef enum nvmlBrandType_enum
-{
-    NVML_BRAND_UNKNOWN          = 0,
-} nvmlBrandType_t;
-// Library handle plus the NVML entry points this package calls.
-typedef struct nvml_handle {
-  void *handle;      // loaded library handle
-  uint16_t verbose;  // passed to logging; non-zero presumably enables it
-  nvmlReturn_t (*nvmlInit_v2)(void);
-  nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
-  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
-} nvml_handle_t;
-// Result of nvml_init.
-typedef struct nvml_init_resp {
-  char *err;  // If err is non-null handle is invalid
-  nvml_handle_t ch;
-} nvml_init_resp_t;
-// Compute capability (major.minor) or an error string.
-typedef struct nvml_compute_capability {
-  char *err;
-  int major;
-  int minor;
-} nvml_compute_capability_t;
-// Load the NVML library at nvml_lib_path and fill in resp.
-void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-// Report free/total/used memory for the device identified by uuid.
-void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
-// Release the resources held by ch.
-void nvml_release(nvml_handle_t ch);
-#endif  // __GPU_INFO_NVML_H__
-#endif  // __APPLE__
\ No newline at end of file
--- a/discover/gpu_info_oneapi.c
+++ b/discover/gpu_info_oneapi.c
-#ifndef __APPLE__
-#include "gpu_info_oneapi.h"
-#include <string.h>
-// Load the Level-Zero management library at oneapi_lib_path, resolve the zes*
-// entry points into resp->oh, call zesInit, and enumerate every driver and the
-// devices beneath each driver.
-//
-// On success resp->err is NULL and resp->oh owns the drivers/num_devices/devices
-// tables (released by oneapi_release).  On failure resp->err is a strdup'd
-// message and resp->oh must not be used.
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
-  ze_result_t ret;
-  resp->err = NULL;
-  resp->oh.devices = NULL;
-  resp->oh.num_devices = NULL;
-  resp->oh.drivers = NULL;
-  resp->oh.num_drivers = 0;
-  enum { buflen = 256 };
-  char buf[buflen + 1];
-  int i;
-  uint32_t d;
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"zesInit", (void *)&resp->oh.zesInit},
-      {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
-      {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
-      {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
-      {"zesDeviceEnumMemoryModules",
-       (void *)&resp->oh.zesDeviceEnumMemoryModules},
-      {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
-      {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
-      {NULL, NULL},
-  };
-  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle) {
-    char *msg = LOAD_ERR();
-    snprintf(buf, buflen,
-             "Unable to load %s library to query for Intel GPUs: %s\n",
-             oneapi_lib_path, msg);
-    free(msg);
-    resp->err = strdup(buf);
-    return;
-  }
-  // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->oh.verbose,
-      "wiring Level-Zero management library functions in %s\n",
-      oneapi_lib_path);
-  for (i = 0; l[i].s != NULL; i++) {
-    // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
-    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!*(l[i].p)) {
-      char *msg = LOAD_ERR();
-      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
-      // Unload BEFORE clearing the handle; clearing first would unload NULL
-      // and leak the real library handle.
-      UNLOAD_LIBRARY(resp->oh.handle);
-      resp->oh.handle = NULL;
-      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
-      free(msg);
-      resp->err = strdup(buf);
-      return;
-    }
-  }
-  LOG(resp->oh.verbose, "calling zesInit\n");
-  ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
-    snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-  // First call with a NULL array only reports how many drivers exist.
-  LOG(resp->oh.verbose, "calling zesDriverGet\n");
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-  LOG(resp->oh.verbose, "oneapi driver count: %u\n", resp->oh.num_drivers);
-  if (resp->oh.num_drivers == 0) {
-    // Nothing to enumerate; the tables stay NULL and the handle stays loaded.
-    return;
-  }
-  // calloc so every per-driver slot starts out NULL/0: oneapi_release walks
-  // all num_drivers slots and must never see an uninitialized pointer when
-  // enumeration fails part-way through.
-  resp->oh.drivers = calloc(resp->oh.num_drivers, sizeof(zes_driver_handle_t));
-  resp->oh.num_devices = calloc(resp->oh.num_drivers, sizeof(uint32_t));
-  resp->oh.devices =
-      calloc(resp->oh.num_drivers, sizeof(zes_device_handle_t *));
-  if (resp->oh.drivers == NULL || resp->oh.num_devices == NULL ||
-      resp->oh.devices == NULL) {
-    resp->err = strdup("unable to allocate oneapi driver tables");
-    oneapi_release(resp->oh);
-    return;
-  }
-  ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, resp->oh.drivers);
-  if (ret != ZE_RESULT_SUCCESS) {
-    LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
-    snprintf(buf, buflen, "unable to get driver count: %x", ret);
-    resp->err = strdup(buf);
-    oneapi_release(resp->oh);
-    return;
-  }
-  for (d = 0; d < resp->oh.num_drivers; d++) {
-    LOG(resp->oh.verbose, "calling zesDeviceGet count %u: %p\n", d,
-        (void *)resp->oh.drivers[d]);
-    // Same two-step pattern as above: count first, then fetch the handles.
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
-                                   &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-    if (resp->oh.num_devices[d] == 0) {
-      continue;
-    }
-    resp->oh.devices[d] =
-        calloc(resp->oh.num_devices[d], sizeof(zes_device_handle_t));
-    if (resp->oh.devices[d] == NULL) {
-      resp->err = strdup("unable to allocate oneapi device table");
-      oneapi_release(resp->oh);
-      return;
-    }
-    ret = (*resp->oh.zesDeviceGet)(
-        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS) {
-      LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
-      snprintf(buf, buflen, "unable to get device count: %x", ret);
-      resp->err = strdup(buf);
-      oneapi_release(resp->oh);
-      return;
-    }
-  }
-  return;
-}
-// Query name, id and memory totals for one device.
-//
-// h       - handle populated by oneapi_init
-// driver  - index into h.drivers / h.devices
-// device  - index into h.devices[driver]
-// resp    - receives gpu_name, gpu_id, and total/free summed over all of the
-//           device's memory modules; resp->err is a strdup'd message on failure
-//           and NULL on success
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp) {
-  ze_result_t ret;
-  resp->err = NULL;
-  enum { buflen = 256 };
-  char buf[buflen + 1];
-  uint32_t m;
-  if (h.handle == NULL) {
-    resp->err = strdup("Level-Zero handle not initialized");
-    return;
-  }
-  // Valid indices are [0, count).  Validate driver before using it to index
-  // num_devices, and reject negatives before the unsigned comparison.
-  if (h.num_devices == NULL || h.devices == NULL || driver < 0 ||
-      (uint32_t)driver >= h.num_drivers || device < 0 ||
-      (uint32_t)device >= h.num_devices[driver] ||
-      h.devices[driver] == NULL) {
-    resp->err = strdup("driver or device index out of bounds");
-    return;
-  }
-  resp->total = 0;
-  resp->free = 0;
-  // Chain the extended properties struct behind the base one via pNext.
-  zes_device_ext_properties_t ext_props;
-  ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
-  ext_props.pNext = NULL;
-  zes_device_properties_t props;
-  props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-  props.pNext = &ext_props;
-  ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device properties: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
-  // TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
-  // (this is probably wrong...)
-  // TODO - the driver isn't included - what if there are multiple drivers?
-  snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
-  if (h.verbose) {
-    // When in verbose mode, report more information about
-    // the card we discover.
-    LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
-        props.modelName);
-    LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
-        props.brandName);
-    LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
-        props.vendorName);
-    LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
-        props.serialNumber);
-    LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
-        props.boardNumber);
-  }
-  // TODO
-  // Compute Capability equivalent in resp->major, resp->minor, resp->patch
-  // First call with a NULL array only reports the number of memory modules.
-  uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
-                                        NULL);
-  if (ret != ZE_RESULT_SUCCESS) {
-    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
-             ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
-  if (memCount == 0) {
-    // No modules: total/free stay 0.
-    return;
-  }
-  zes_mem_handle_t *mems = calloc(memCount, sizeof(zes_mem_handle_t));
-  if (mems == NULL) {
-    resp->err = strdup("unable to allocate Level-Zero memory module list");
-    return;
-  }
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
-                                        mems);
-  if (ret != ZE_RESULT_SUCCESS) {
-    // Previously ignored, which left mems unfilled for the loop below.
-    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
-             ret);
-    resp->err = strdup(buf);
-    free(mems);
-    return;
-  }
-  for (m = 0; m < memCount; m++) {
-    zes_mem_state_t state;
-    state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
-    state.pNext = NULL;
-    ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS) {
-      snprintf(buf, buflen, "unable to get memory state: %x", ret);
-      resp->err = strdup(buf);
-      free(mems);
-      return;
-    }
-    resp->total += state.size;
-    resp->free += state.free;
-  }
-  free(mems);
-}
-// Free the per-driver device tables, the device-count table and the driver
-// table referenced by h, then unload the Level-Zero library.
-//
-// h is received by value, so the caller's copy of the handle is not modified
-// by this function and must not be used after the call.
-void oneapi_release(oneapi_handle_t h) {
-  LOG(h.verbose, "releasing oneapi library\n");
-  if (h.devices != NULL) {
-    for (uint32_t idx = 0; idx < h.num_drivers; idx++) {
-      free(h.devices[idx]);  // free(NULL) is a no-op
-    }
-    free(h.devices);
-  }
-  free(h.num_devices);
-  free(h.drivers);
-  UNLOAD_LIBRARY(h.handle);
-}
-// Return the number of devices enumerated under the given driver index, or 0
-// when the handle is not initialized or driver is outside [0, h.num_drivers).
-int oneapi_get_device_count(oneapi_handle_t h, int driver) {
-  if (h.handle == NULL || h.num_devices == NULL) {
-    return 0;
-  }
-  // num_devices has exactly num_drivers entries, so driver == num_drivers is
-  // already out of range; negatives are rejected before the unsigned compare.
-  if (driver < 0 || (uint32_t)driver >= h.num_drivers) {
-    return 0;
-  }
-  return (int)h.num_devices[driver];
-}
-#endif // __APPLE__
--- a/discover/gpu_info_oneapi.h
+++ b/discover/gpu_info_oneapi.h
-#ifndef __APPLE__
-#ifndef __GPU_INFO_ONEAPI_H__
-#define __GPU_INFO_ONEAPI_H__
-#include "gpu_info.h"
-#define ZE_MAX_DEVICE_NAME 256
-#define ZE_MAX_DEVICE_UUID_SIZE 16
-#define ZES_STRING_PROPERTY_SIZE 64
-// Argument is parenthesized so expressions such as ZE_BIT(a + b) expand safely.
-#define ZE_BIT(_i) (1 << (_i))
-// Hand-written subset of the Level-Zero (ze/zes) API: just enough typedefs to
-// dlopen/dlsym the library for memory information.
-// NOTE(review): struct layouts presumably must match the real Level-Zero
-// headers because they are passed to the loaded library - verify on upgrade.
-typedef enum ze_result_t {
-  ZE_RESULT_SUCCESS = 0,
-  // Other values omitted for now...
-} ze_result_t;
-typedef uint8_t ze_bool_t;
-// Opaque handles returned by the library.
-typedef struct _zes_driver_handle_t *zes_driver_handle_t;
-typedef struct _zes_device_handle_t *zes_device_handle_t;
-typedef struct _zes_mem_handle_t *zes_mem_handle_t;
-typedef enum _ze_structure_type_t {
-  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
-} ze_structure_type_t;
-// Values written to the stype field of the zes structures declared below.
-typedef enum _zes_structure_type_t {
-  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
-  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
-  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
-  ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
-  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_structure_type_t;
-typedef enum _zes_mem_type_t {
-  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_mem_type_t;
-typedef enum _zes_mem_loc_t {
-  ZES_MEM_LOC_SYSTEM = 0,
-  ZES_MEM_LOC_DEVICE = 1,
-  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
-} zes_mem_loc_t;
-typedef enum _zes_mem_health_t {
-  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
-} zes_mem_health_t;
-typedef struct _ze_device_uuid_t {
-  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
-} ze_device_uuid_t;
-typedef struct _zes_uuid_t {
-  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
-} zes_uuid_t;
-typedef enum _ze_device_type_t {
-  ZE_DEVICE_TYPE_GPU = 1,
-  ZE_DEVICE_TYPE_CPU = 2,
-  ZE_DEVICE_TYPE_FPGA = 3,
-  ZE_DEVICE_TYPE_MCA = 4,
-  ZE_DEVICE_TYPE_VPU = 5,
-  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
-} ze_device_type_t;
-typedef enum _zes_device_type_t {
-  ZES_DEVICE_TYPE_GPU = 1,
-  ZES_DEVICE_TYPE_CPU = 2,
-  ZES_DEVICE_TYPE_FPGA = 3,
-  ZES_DEVICE_TYPE_MCA = 4,
-  ZES_DEVICE_TYPE_VPU = 5,
-  ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
-} zes_device_type_t;
-typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t {
-  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
-  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
-  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
-  ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
-  ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
-} ze_device_property_flag_t;
-typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t {
-  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
-  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
-  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
-  ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
-  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
-} zes_device_property_flag_t;
-// Core device properties; embedded in zes_device_properties_t below.
-typedef struct _ze_device_properties_t {
-  ze_structure_type_t stype;
-  void *pNext;
-  ze_device_type_t type;
-  uint32_t vendorId;
-  uint32_t deviceId;
-  ze_device_property_flags_t flags;
-  uint32_t subdeviceId;
-  uint32_t coreClockRate;
-  uint64_t maxMemAllocSize;
-  uint32_t maxHardwareContexts;
-  uint32_t maxCommandQueuePriority;
-  uint32_t numThreadsPerEU;
-  uint32_t physicalEUSimdWidth;
-  uint32_t numEUsPerSubslice;
-  uint32_t numSubslicesPerSlice;
-  uint32_t numSlices;
-  uint64_t timerResolution;
-  uint32_t timestampValidBits;
-  uint32_t kernelTimestampValidBits;
-  ze_device_uuid_t uuid;
-  char name[ZE_MAX_DEVICE_NAME];
-} ze_device_properties_t;
-// Filled in by zesDeviceGetProperties.
-typedef struct _zes_device_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  ze_device_properties_t core;
-  uint32_t numSubdevices;
-  char serialNumber[ZES_STRING_PROPERTY_SIZE];
-  char boardNumber[ZES_STRING_PROPERTY_SIZE];
-  char brandName[ZES_STRING_PROPERTY_SIZE];
-  char modelName[ZES_STRING_PROPERTY_SIZE];
-  char vendorName[ZES_STRING_PROPERTY_SIZE];
-  char driverVersion[ZES_STRING_PROPERTY_SIZE];
-} zes_device_properties_t;
-// Extension struct chained behind zes_device_properties_t through pNext.
-typedef struct _zes_device_ext_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  zes_uuid_t uuid;
-  zes_device_type_t type;
-  zes_device_property_flags_t flags;
-} zes_device_ext_properties_t;
-typedef struct _zes_mem_properties_t {
-  zes_structure_type_t stype;
-  void *pNext;
-  zes_mem_type_t type;
-  ze_bool_t onSubdevice;
-  uint32_t subdeviceId;
-  zes_mem_loc_t location;
-  uint64_t physicalSize;
-  int32_t busWidth;
-  int32_t numChannels;
-} zes_mem_properties_t;
-// Filled in by zesMemoryGetState; free and size feed the VRAM totals.
-typedef struct _zes_mem_state_t {
-  zes_structure_type_t stype;
-  const void *pNext;
-  zes_mem_health_t health;
-  uint64_t free;
-  uint64_t size;
-} zes_mem_state_t;
-// Library handle, enumerated drivers/devices, and the resolved entry points.
-typedef struct oneapi_handle {
-  void *handle;                   // loaded library handle; NULL when not initialized
-  uint16_t verbose;               // passed to LOG
-  uint32_t num_drivers;           // number of entries in drivers, num_devices and devices
-  zes_driver_handle_t *drivers;   // one handle per driver
-  uint32_t *num_devices;          // num_devices[d] = device count for driver d
-  zes_device_handle_t **devices;  // devices[d][i] = device i of driver d
-  // TODO Driver major, minor information
-  // int driver_major;
-  // int driver_minor;
-  ze_result_t (*zesInit)(int);
-  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
-  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
-                              zes_device_handle_t *phDevices);
-  ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
-                                        zes_device_properties_t *pProperties);
-  ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
-                                            uint32_t *pCount,
-                                            zes_mem_handle_t *phMemory);
-  ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
-                                        zes_mem_properties_t *pProperties);
-  ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
-                                   zes_mem_state_t *pState);
-} oneapi_handle_t;
-// Result of oneapi_init.
-typedef struct oneapi_init_resp {
-  char *err; // If err is non-null handle is invalid
-  oneapi_handle_t oh;
-} oneapi_init_resp_t;
-typedef struct oneapi_version_resp {
-  ze_result_t status;
-  char *str; // Contains version or error string if status != 0
-} oneapi_version_resp_t;
-// Load the library at oneapi_lib_path and enumerate drivers and devices.
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-// Fill resp with the name, id and total/free memory of devices[driver][device].
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
-                       mem_info_t *resp);
-// Free the tables owned by h and unload the library.
-void oneapi_release(oneapi_handle_t h);
-// Number of devices under the given driver index (0 if unavailable).
-int oneapi_get_device_count(oneapi_handle_t h, int driver);
-#endif // __GPU_INFO_ONEAPI_H__
-#endif // __APPLE__
--- a/discover/gpu_test.go
+++ b/discover/gpu_test.go
-package discover
-import (
-	"runtime"
-	"testing"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-// TestBasicGetGPUInfo checks that discovery returns at least one entry from a
-// known library and that non-CPU entries report non-zero total and free memory.
-func TestBasicGetGPUInfo(t *testing.T) {
-	info := GetGPUInfo()
-	// require (not assert) so an empty result fails here instead of panicking
-	// on info[0] below.
-	require.NotEmpty(t, info)
-	// Match against the exact library names; a substring check on one joined
-	// string would also accept "" or fragments such as "c".
-	assert.Contains(t, []string{"cuda", "rocm", "cpu", "metal"}, info[0].Library)
-	if info[0].Library != "cpu" {
-		assert.Greater(t, info[0].TotalMemory, uint64(0))
-		assert.Greater(t, info[0].FreeMemory, uint64(0))
-	}
-}
-// TestCPUMemInfo verifies GetCPUMem succeeds and, on linux and windows,
-// reports non-zero total and free memory. darwin is skipped; other platforms
-// only check for the absence of an error.
-func TestCPUMemInfo(t *testing.T) {
-	mem, err := GetCPUMem()
-	require.NoError(t, err)
-	if runtime.GOOS == "darwin" {
-		t.Skip("CPU memory not populated on darwin")
-	}
-	if runtime.GOOS == "linux" || runtime.GOOS == "windows" {
-		assert.Greater(t, mem.TotalMemory, uint64(0))
-		assert.Greater(t, mem.FreeMemory, uint64(0))
-	}
-}
-// TestByLibrary checks how many groups GpuInfoList.ByLibrary produces for
-// various mixes of Library and Variant values.
-func TestByLibrary(t *testing.T) {
-	type scenario struct {
-		input  []GpuInfo
-		expect int
-	}
-	cpu := GpuInfo{Library: "cpu"}
-	cuda := GpuInfo{Library: "cuda"}
-	cudaV11 := GpuInfo{Library: "cuda", Variant: "v11"}
-	cudaV12 := GpuInfo{Library: "cuda", Variant: "v12"}
-	scenarios := map[string]*scenario{
-		"empty":                    {input: []GpuInfo{}, expect: 0},
-		"cpu":                      {input: []GpuInfo{cpu}, expect: 1},
-		"cpu + GPU":                {input: []GpuInfo{cpu, cuda}, expect: 2},
-		"cpu + 2 GPU no variant":   {input: []GpuInfo{cpu, cuda, cuda}, expect: 2},
-		"cpu + 2 GPU same variant": {input: []GpuInfo{cpu, cudaV11, cudaV11}, expect: 2},
-		"cpu + 2 GPU diff variant": {input: []GpuInfo{cpu, cudaV11, cudaV12}, expect: 3},
-	}
-	for name, sc := range scenarios {
-		t.Run(name, func(t *testing.T) {
-			groups := GpuInfoList(sc.input).ByLibrary()
-			if got := len(groups); got != sc.expect {
-				t.Fatalf("expected length %d, got %d => %+v", sc.expect, got, groups)
-			}
-		})
-	}
-}
-// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/discover/runner.go
+++ b/discover/runner.go
+package discover
+// Runner based GPU discovery
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"math/rand"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/logutil"
+	"github.com/ollama/ollama/ml"
+)
+// Package-level discovery state. GPUDevices reads and writes all of it while
+// holding deviceMu.
+var (
+	// deviceMu is locked for the full duration of GPUDevices.
+	deviceMu     sync.Mutex
+	// devices is the list built by the first GPUDevices call; later calls
+	// refresh FreeMemory on its entries and return it.
+	devices      []ml.DeviceInfo
+	// libDirs is the set of library directories to probe. After bootstrap it is
+	// narrowed to the directories of the retained devices; the "" key is a
+	// placeholder meaning "no library-specific directory".
+	libDirs      map[string]struct{}
+	// rocmDir is LibOllamaPath/rocm when that path exists, otherwise "".
+	rocmDir      string
+	// exe is the path of the running executable (symlinks resolved when
+	// possible); bootstrapDevices launches it as the runner.
+	exe          string
+	// bootstrapped is set once the initial discovery pass has completed.
+	bootstrapped bool
+)
+// GPUDevices returns the GPU devices known to this process.
+//
+// The first call performs bootstrap discovery: it spawns a runner once per
+// detected library directory to enumerate devices, re-probes each non-Metal
+// device in isolation to drop ones that fail deep initialization, removes
+// overlaps between library versions and between libraries, and caches the
+// result in the package-level devices slice.
+//
+// Later calls only refresh FreeMemory on the cached entries, first via the
+// supplied active runners and then, for anything still stale, via a fresh
+// bootstrap runner. If the refresh cannot complete, the previous values are kept.
+//
+// Returns nil if the executable path cannot be determined. The returned slice
+// is the shared package-level slice, not a copy.
+func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
+	deviceMu.Lock()
+	defer deviceMu.Unlock()
+	startDiscovery := time.Now()
+	msg := "overall device VRAM discovery took"
+	// The closure reads msg at return time, so the bootstrap branch can relabel it.
+	defer func() {
+		slog.Debug(msg, "duration", time.Since(startDiscovery))
+	}()
+	if !bootstrapped {
+		msg = "GPU bootstrap discovery took"
+		libDirs = make(map[string]struct{})
+		var err error
+		exe, err = os.Executable()
+		if err != nil {
+			slog.Error("unable to lookup executable path", "error", err)
+			return nil
+		}
+		if eval, err := filepath.EvalSymlinks(exe); err == nil {
+			exe = eval
+		}
+		files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
+		if err != nil {
+			slog.Debug("unable to lookup runner library directories", "error", err)
+		}
+		for _, file := range files {
+			libDirs[filepath.Dir(file)] = struct{}{}
+		}
+		// Our current packaging model places ggml-hip in the main directory
+		// but keeps rocm in an isolated directory.  We have to add it to
+		// the [LD_LIBRARY_]PATH so ggml-hip will load properly
+		rocmDir = filepath.Join(LibOllamaPath, "rocm")
+		if _, err := os.Stat(rocmDir); err != nil {
+			rocmDir = ""
+		}
+		if len(libDirs) == 0 {
+			// "" is a placeholder so the loops below still run once
+			libDirs[""] = struct{}{}
+		}
+		slog.Info("discovering available GPUs...")
+		// For our initial discovery pass, we gather all the known GPUs through
+		// all the libraries that were detected. This pass may include GPUs that
+		// are enumerated, but not actually supported.
+		// We run this in serial to avoid potentially initializing a GPU multiple
+		// times concurrently leading to memory contention
+		for dir := range libDirs {
+			var dirs []string
+			if dir == "" {
+				dirs = []string{LibOllamaPath}
+			} else {
+				dirs = []string{LibOllamaPath, dir}
+			}
+			// Typically bootstrapping takes < 1s, but on some systems, with devices
+			// in low power/idle mode, initialization can take multiple seconds.  We
+			// set a long timeout just for bootstrap discovery to reduce the chance
+			// of giving up too quickly
+			ctx1stPass, cancel := context.WithTimeout(ctx, 30*time.Second)
+			defer cancel()
+			// For this pass, we retain duplicates in case any are incompatible with some libraries
+			devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
+		}
+		// In the second pass, we more deeply initialize the GPUs to weed out devices that
+		// aren't supported by a given library.  We run this phase in parallel to speed up discovery.
+		slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
+		ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+		var wg sync.WaitGroup
+		// Each goroutine writes only its own needsDelete[i]; the shared map is guarded by supportedMu.
+		needsDelete := make([]bool, len(devices))
+		supportedMu := sync.Mutex{}
+		supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
+		for i := range devices {
+			libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
+			if devices[i].Library == "Metal" {
+				continue
+			}
+			slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
+			wg.Add(1)
+			go func(i int) {
+				defer wg.Done()
+				var envVar string
+				if devices[i].Library == "ROCm" {
+					if runtime.GOOS != "linux" {
+						envVar = "HIP_VISIBLE_DEVICES"
+					} else {
+						envVar = "ROCR_VISIBLE_DEVICES"
+					}
+				} else {
+					envVar = "CUDA_VISIBLE_DEVICES"
+				}
+				extraEnvs := []string{
+					"GGML_CUDA_INIT=1",           // force deep initialization to trigger crash on unsupported GPUs
+					envVar + "=" + devices[i].ID, // Filter to just this one GPU
+				}
+				if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
+					needsDelete[i] = true
+				} else {
+					supportedMu.Lock()
+					if _, ok := supported[devices[i].Library]; !ok {
+						supported[devices[i].Library] = make(map[string]map[string]int)
+					}
+					if _, ok := supported[devices[i].Library][libDir]; !ok {
+						supported[devices[i].Library][libDir] = make(map[string]int)
+					}
+					supported[devices[i].Library][libDir][devices[i].ID] = i
+					supportedMu.Unlock()
+				}
+			}(i)
+		}
+		wg.Wait()
+		logutil.Trace("supported GPU library combinations", "supported", supported)
+		// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
+		filterOverlapByLibrary(supported, needsDelete)
+		// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
+		rocmID := 0
+		// devices and needsDelete are compacted in lock-step, hence the i-- after a removal.
+		for i := 0; i < len(needsDelete); i++ {
+			if needsDelete[i] {
+				logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
+				devices = append(devices[:i], devices[i+1:]...)
+				needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
+				i--
+			} else if devices[i].Library == "ROCm" {
+				if _, err := strconv.Atoi(devices[i].ID); err == nil {
+					// Replace the numeric ID with the post-filtered IDs
+					devices[i].FilteredID = devices[i].ID
+					devices[i].ID = strconv.Itoa(rocmID)
+				}
+				rocmID++
+			}
+		}
+		// Now filter out any overlap with different libraries (favor CUDA/ROCm over others)
+		for i := 0; i < len(devices); i++ {
+			for j := i + 1; j < len(devices); j++ {
+				// For this pass, we only drop exact duplicates
+				switch devices[i].Compare(devices[j]) {
+				case ml.SameBackendDevice:
+					// Same library and device, skip it
+					devices = append(devices[:j], devices[j+1:]...)
+					j--
+					continue
+				case ml.DuplicateDevice:
+					// Different library, choose based on priority
+					var droppedDevice ml.DeviceInfo
+					if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
+						droppedDevice = devices[j]
+					} else {
+						// Keep j's entry in slot i before j is removed below
+						droppedDevice = devices[i]
+						devices[i] = devices[j]
+					}
+					devices = append(devices[:j], devices[j+1:]...)
+					j--
+					typeStr := "discrete"
+					if droppedDevice.Integrated {
+						typeStr = "iGPU"
+					}
+					slog.Debug("dropping duplicate device",
+						"id", droppedDevice.ID,
+						"library", droppedDevice.Library,
+						"compute", droppedDevice.Compute(),
+						"name", droppedDevice.Name,
+						"description", droppedDevice.Description,
+						"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
+						"driver", droppedDevice.Driver(),
+						"pci_id", droppedDevice.PCIID,
+						"type", typeStr,
+						"total", format.HumanBytes2(droppedDevice.TotalMemory),
+						"available", format.HumanBytes2(droppedDevice.FreeMemory),
+					)
+					continue
+				}
+			}
+		}
+		// Reset the libDirs to what we actually wind up using for future refreshes
+		libDirs = make(map[string]struct{})
+		for _, dev := range devices {
+			dir := dev.LibraryPath[len(dev.LibraryPath)-1]
+			if dir != LibOllamaPath {
+				libDirs[dir] = struct{}{}
+			}
+		}
+		if len(libDirs) == 0 {
+			libDirs[""] = struct{}{}
+		}
+		bootstrapped = true
+	} else {
+		if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+			// metal never updates free VRAM
+			return devices
+		}
+		slog.Debug("refreshing free memory")
+		// updated[i] records whether devices[i] has fresh FreeMemory in this call.
+		updated := make([]bool, len(devices))
+		allDone := func() bool {
+			allDone := true
+			for _, done := range updated {
+				if !done {
+					allDone = false
+					break
+				}
+			}
+			return allDone
+		}
+		// First try to use existing runners to refresh VRAM since they're already
+		// active on GPU(s)
+		for _, runner := range runners {
+			if runner == nil {
+				continue
+			}
+			deviceIDs := runner.GetActiveDeviceIDs()
+			if len(deviceIDs) == 0 {
+				// Skip this runner since it doesn't have active GPU devices
+				continue
+			}
+			// Check to see if this runner is active on any devices that need a refresh
+			skip := true
+		devCheck:
+			for _, dev := range deviceIDs {
+				for i := range devices {
+					if dev == devices[i].DeviceID {
+						if !updated[i] {
+							skip = false
+							break devCheck
+						}
+					}
+				}
+			}
+			if skip {
+				continue
+			}
+			// Typical refresh on existing runner is ~500ms but allow longer if the system
+			// is under stress before giving up and using stale data.
+			ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+			defer cancel()
+			start := time.Now()
+			updatedDevices := runner.GetDeviceInfos(ctx)
+			slog.Debug("existing runner discovery took", "duration", time.Since(start))
+			for _, u := range updatedDevices {
+				for i := range devices {
+					if u.DeviceID == devices[i].DeviceID {
+						updated[i] = true
+						devices[i].FreeMemory = u.FreeMemory
+						break
+					}
+				}
+			}
+			// Short circuit if we've updated all the devices
+			if allDone() {
+				break
+			}
+		}
+		if !allDone() {
+			slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
+			// Bootstrapping may take longer in some cases (AMD windows), but we
+			// would rather use stale free data to get the model running sooner
+			ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+			defer cancel()
+			for dir := range libDirs {
+				// Mirror the bootstrap pass: "" is only a placeholder for "no
+				// library-specific directory" and must not be handed on as a
+				// path, where it would become an empty search-path entry.
+				dirs := []string{LibOllamaPath}
+				if dir != "" {
+					dirs = append(dirs, dir)
+				}
+				updatedDevices := bootstrapDevices(ctx, dirs, nil)
+				for _, u := range updatedDevices {
+					for i := range devices {
+						if u.DeviceID == devices[i].DeviceID {
+							updated[i] = true
+							devices[i].FreeMemory = u.FreeMemory
+							break
+						}
+					}
+					// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
+				}
+				if allDone() {
+					break
+				}
+			}
+			if !allDone() {
+				slog.Warn("unable to refresh free memory, using old values")
+			}
+		}
+	}
+	return devices
+}
+func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
+	// For multi-GPU systems, use the newest version that supports all the GPUs
+	for _, byLibDirs := range supported {
+		libDirs := make([]string, 0, len(byLibDirs))
+		for libDir := range byLibDirs {
+			libDirs = append(libDirs, libDir)
+		}
+		sort.Sort(sort.Reverse(sort.StringSlice(libDirs)))
+		anyMissing := false
+		var newest string
+		for _, newest = range libDirs {
+			for _, libDir := range libDirs {
+				if libDir == newest {
+					continue
+				}
+				if len(byLibDirs[newest]) != len(byLibDirs[libDir]) {
+					anyMissing = true
+					break
+				}
+				for dev := range byLibDirs[newest] {
+					if _, found := byLibDirs[libDir][dev]; !found {
+						anyMissing = true
+						break
+					}
+				}
+			}
+			if !anyMissing {
+				break
+			}
+		}
+		// Now we can mark overlaps for deletion
+		for _, libDir := range libDirs {
+			if libDir == newest {
+				continue
+			}
+			for dev, i := range byLibDirs[libDir] {
+				if _, found := byLibDirs[newest][dev]; found {
+					needsDelete[i] = true
+				}
+			}
+		}
+	}
+}
+// bootstrapRunner describes a runner subprocess started for discovery: the
+// port chosen for it and the command used to launch it.
+type bootstrapRunner struct {
+	port int
+	cmd  *exec.Cmd
+}
+
+// GetPort returns the port recorded for this runner.
+func (r *bootstrapRunner) GetPort() int {
+	return r.port
+}
+
+// HasExited reports whether the runner's command has a recorded ProcessState.
+// It returns false when no command is set.
+func (r *bootstrapRunner) HasExited() bool {
+	return r.cmd != nil && r.cmd.ProcessState != nil
+}
+// bootstrapDevices starts a runner subprocess ("runner --ollama-engine --port N")
+// and asks it, via GetDevicesFromRunner, which devices it can see.
+//
+// ollamaLibDirs is exported to the child as OLLAMA_LIBRARY_PATH and is also placed
+// ahead of any inherited entries on the platform library search path variable.
+// extraEnvs holds additional "KEY=value" entries; each one replaces a variable of
+// the same name inherited from this process, or is appended if there is none.
+//
+// Returns nil when the subprocess cannot be started. Discovery errors are logged
+// and the (possibly nil) device list from GetDevicesFromRunner is returned as-is.
+// The subprocess is killed when this function returns.
+func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
+	// TODO DRY out with llm/server.go
+	slog.Debug("spawing runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+	start := time.Now()
+	defer func() {
+		slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
+	}()
+	// Ask the OS for a free port by briefly listening on port 0, then release it
+	// so the runner can bind it
+	port := 0
+	if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+		var l *net.TCPListener
+		if l, err = net.ListenTCP("tcp", a); err == nil {
+			port = l.Addr().(*net.TCPAddr).Port
+			l.Close()
+		}
+	}
+	if port == 0 {
+		slog.Debug("ResolveTCPAddr failed, using random port")
+		port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+	}
+	params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
+	// Name of the dynamic library search path variable on this platform
+	var pathEnv string
+	switch runtime.GOOS {
+	case "windows":
+		pathEnv = "PATH"
+	case "darwin":
+		pathEnv = "DYLD_LIBRARY_PATH"
+	default:
+		pathEnv = "LD_LIBRARY_PATH"
+	}
+	libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
+	if rocmDir != "" {
+		libraryPaths = append(libraryPaths, rocmDir)
+	}
+	// Note: we always put our dependency paths first
+	// since these are the exact version we compiled/linked against
+	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
+	}
+	cmd := exec.Command(exe, params...)
+	cmd.Env = os.Environ()
+	// Only surface the child's output when trace logging is enabled
+	if envconfig.LogLevel() == logutil.LevelTrace {
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+	}
+	// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
+	cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
+	pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
+	pathNeeded := true
+	// extraDone[j] records that extraEnvs[j] has already replaced an inherited variable
+	extraDone := make([]bool, len(extraEnvs))
+	for i := range cmd.Env {
+		cmp := strings.SplitN(cmd.Env[i], "=", 2)
+		if strings.EqualFold(cmp[0], pathEnv) {
+			cmd.Env[i] = pathEnv + "=" + pathEnvVal
+			pathNeeded = false
+		} else {
+			for j := range extraEnvs {
+				if extraDone[j] {
+					continue
+				}
+				extra := strings.SplitN(extraEnvs[j], "=", 2)
+				if cmp[0] == extra[0] {
+					cmd.Env[i] = extraEnvs[j]
+					// Index by j (position in extraEnvs), not i (position in cmd.Env):
+					// using i can run past the end of extraDone or flag the wrong entry
+					extraDone[j] = true
+				}
+			}
+		}
+	}
+	if pathNeeded {
+		cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
+	}
+	// Anything in extraEnvs that did not override an inherited variable is appended
+	for i := range extraDone {
+		if !extraDone[i] {
+			cmd.Env = append(cmd.Env, extraEnvs[i])
+		}
+	}
+	logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
+	if err := cmd.Start(); err != nil {
+		slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
+		return nil
+	}
+	// Reap the child in the background so cmd.ProcessState gets populated on exit
+	go func() {
+		cmd.Wait() // exit status ignored
+	}()
+	defer cmd.Process.Kill()
+	devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
+	if err != nil {
+		if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
+			// Expected during bootstrapping while we filter out unsupported AMD GPUs
+			logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
+		} else {
+			slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
+		}
+	}
+	logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
+	return devices
+}
+// GetDevicesFromRunner polls the runner's /info endpoint on 127.0.0.1 every 10ms
+// until it returns a device list, the runner exits, or ctx is done.
+//
+// Returns the decoded device list on an HTTP 200 whose body parses as JSON.
+// Returns an error when ctx is done, when a request fails and the runner reports
+// it has exited, when the endpoint answers 404 (runner without this endpoint), or
+// when the runner answers with any other non-200 status. Failed connections,
+// unreadable bodies and undecodable bodies are retried on the next tick.
+func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
+	var moreDevices []ml.DeviceInfo
+	port := runner.GetPort()
+	// Explicit ticker so it can be stopped as soon as polling ends
+	ticker := time.NewTicker(10 * time.Millisecond)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return nil, fmt.Errorf("failed to finish discovery before timeout")
+		case <-ticker.C:
+			r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
+			if err != nil {
+				return nil, fmt.Errorf("failed to create request: %w", err)
+			}
+			r.Header.Set("Content-Type", "application/json")
+			resp, err := http.DefaultClient.Do(r)
+			if err != nil {
+				// Not logged: the request is simply retried unless the runner is gone
+				if runner.HasExited() {
+					return nil, fmt.Errorf("runner crashed")
+				}
+				continue
+			}
+			if resp.StatusCode == http.StatusNotFound {
+				resp.Body.Close()
+				// old runner, fall back to bootstrapping model
+				return nil, fmt.Errorf("llamarunner free vram reporting not supported")
+			}
+			body, err := io.ReadAll(resp.Body)
+			// Close per iteration; a defer here would keep every retried response
+			// open until the function returns
+			resp.Body.Close()
+			if err != nil {
+				slog.Warn("failed to read response", "error", err)
+				continue
+			}
+			if resp.StatusCode != 200 {
+				logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
+				return nil, fmt.Errorf("runner error: %s", string(body))
+			}
+			if err := json.Unmarshal(body, &moreDevices); err != nil {
+				slog.Warn("unmarshal encode response", "error", err)
+				continue
+			}
+			return moreDevices, nil
+		}
+	}
+}
--- a/discover/runner_test.go
+++ b/discover/runner_test.go
+package discover
+import (
+	"testing"
+	"github.com/ollama/ollama/app/lifecycle"
+)
+// init calls lifecycle.InitLogging once when this test package is initialized,
+// before any test in the package runs.
+func init() {
+	lifecycle.InitLogging()
+}
+// TestFilterOverlapByLibrary drives filterOverlapByLibrary with a table of
+// library -> library directory -> device ID -> index inputs and checks which
+// indexes end up flagged for deletion.
+func TestFilterOverlapByLibrary(t *testing.T) {
+	const (
+		gpuA = "GPU-d7b00605-c0c8-152d-529d-e03726d5dc52"
+		gpuB = "GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e"
+	)
+	cases := []struct {
+		name      string
+		supported map[string]map[string]map[string]int
+		want      []bool // expected needsDelete, indexed like the innermost map values
+	}{
+		{
+			name:      "empty",
+			supported: map[string]map[string]map[string]int{},
+			want:      []bool{},
+		},
+		{
+			name: "single no overlap",
+			supported: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v12": {gpuA: 0},
+				},
+			},
+			want: []bool{false},
+		},
+		{
+			name: "100% overlap pick 2nd",
+			supported: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v12": {gpuA: 0, gpuB: 1},
+					"cuda_v13": {gpuA: 2, gpuB: 3},
+				},
+			},
+			want: []bool{true, true, false, false},
+		},
+		{
+			name: "100% overlap pick 1st",
+			supported: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {gpuA: 0, gpuB: 1},
+					"cuda_v12": {gpuA: 2, gpuB: 3},
+				},
+			},
+			want: []bool{false, false, true, true},
+		},
+		{
+			name: "partial overlap pick older",
+			supported: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {gpuA: 0},
+					"cuda_v12": {gpuA: 1, gpuB: 2},
+				},
+			},
+			want: []bool{true, false, false},
+		},
+		{
+			name: "no overlap",
+			supported: map[string]map[string]map[string]int{
+				"CUDA": {
+					"cuda_v13": {gpuA: 0},
+					"cuda_v12": {gpuB: 1},
+				},
+			},
+			want: []bool{false, false},
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got := make([]bool, len(c.want))
+			filterOverlapByLibrary(c.supported, got)
+			for idx := range c.want {
+				if got[idx] != c.want[idx] {
+					t.Fatalf("expected: %v\ngot: %v", c.want, got)
+				}
+			}
+		})
+	}
+}
--- a/discover/types.go
+++ b/discover/types.go
 package discover
 import (
-	"fmt"
+	"context"
 	"log/slog"
+	"path/filepath"
+	"runtime"
+	"strings"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 )
 type memInfo struct {
@@ -15,8 +19,8 @@ type memInfo struct {
 // Beginning of an `ollama info` command
 type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
+	ml.DeviceID
 	memInfo
-	Library string `json:"library,omitempty"`
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant"`
@@ -27,17 +31,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath []string `json:"lib_path,omitempty"`
-	// Extra environment variables specific to the GPU as list of [key=value]
-	EnvWorkarounds []string `json:"envs,omitempty"`
 	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
 	// the FreeMemory is best effort, and may over or under report actual memory usage
 	// False indicates FreeMemory can generally be trusted on this GPU
 	UnreliableFreeMemory bool
 	// GPU information
-	ID       string `json:"gpu_id"` // string to use for selection of this specific GPU
+	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
-	filterID int    //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
 	Name     string `json:"name"`    // user friendly name if available
 	Compute  string `json:"compute"` // Compute Capability or gfx
@@ -70,37 +70,8 @@ type CPU struct {
 	ThreadCount         int
 }
-type CudaGPUInfo struct {
-	GpuInfo
-	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index        int    //nolint:unused,nolintlint
-	computeMajor int    //nolint:unused,nolintlint
-	computeMinor int    //nolint:unused,nolintlint
-}
-type CudaGPUInfoList []CudaGPUInfo
-type RocmGPUInfo struct {
-	GpuInfo
-	usedFilepath string //nolint:unused,nolintlint
-	index        int    //nolint:unused,nolintlint
-}
-type RocmGPUInfoList []RocmGPUInfo
-type OneapiGPUInfo struct {
-	GpuInfo
-	driverIndex int //nolint:unused,nolintlint
-	gpuIndex    int //nolint:unused,nolintlint
-}
-type OneapiGPUInfoList []OneapiGPUInfo
 type GpuInfoList []GpuInfo
-type UnsupportedGPUInfo struct {
-	GpuInfo
-	Reason string `json:"reason"`
-}
-// Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
 	libs := []string{}
@@ -125,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }
-// Report the GPU information into the log an Info level
+func LogDetails(devices []ml.DeviceInfo) {
-func (l GpuInfoList) LogDetails() {
+	for _, dev := range devices {
-	for _, g := range l {
+		var libs []string
+		for _, dir := range dev.LibraryPath {
+			if strings.Contains(dir, filepath.Join("lib", "ollama")) {
+				libs = append(libs, filepath.Base(dir))
+			}
+		}
+		typeStr := "discrete"
+		if dev.Integrated {
+			typeStr = "iGPU"
+		}
 		slog.Info("inference compute",
-			"id", g.ID,
+			"id", dev.ID,
-			"library", g.Library,
+			"library", dev.Library,
-			"variant", g.Variant,
+			"compute", dev.Compute(),
-			"compute", g.Compute,
+			"name", dev.Name,
-			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
+			"description", dev.Description,
-			"name", g.Name,
+			"libdirs", strings.Join(libs, ","),
-			"total", format.HumanBytes2(g.TotalMemory),
+			"driver", dev.Driver(),
-			"available", format.HumanBytes2(g.FreeMemory),
+			"pci_id", dev.PCIID,
+			"type", typeStr,
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
+		)
+	}
+	// CPU inference
+	if len(devices) == 0 {
+		dev, _ := GetCPUMem()
+		slog.Info("inference compute",
+			"id", "cpu",
+			"library", "cpu",
+			"compute", "",
+			"name", "cpu",
+			"description", "cpu",
+			"libdirs", "ollama",
+			"driver", "",
+			"pci_id", "",
+			"type", "",
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
 }
@@ -149,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
 type SystemInfo struct {
-	System          CPUInfo              `json:"system"`
+	System CPUInfo   `json:"system"`
-	GPUs            []GpuInfo            `json:"gpus"`
+	GPUs   []GpuInfo `json:"gpus"`
-	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
-	DiscoveryErrors []string             `json:"discovery_errors"`
 }
 // Return the optimal number of threads to use for inference
 func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
-		return 0
+		// Fall back to Go's num CPU
+		return runtime.NumCPU()
 	}
 	coreCount := 0
@@ -173,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
-			gpu.Library == "metal" ||
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
-			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "rocm"
+			gpu.Library == "ROCm"
 		if !supportsFA {
 			return false
@@ -183,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	}
 	return true
 }
+// BaseRunner is the smallest view of a runner process: the port it can be
+// reached on and whether it has stopped running.
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+	// HasExited indicates if the runner is no longer running.  This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+// RunnerDiscovery extends BaseRunner with the ability to query the runner for
+// its device list.
+type RunnerDiscovery interface {
+	BaseRunner
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+}
+// FilteredRunnerDiscovery extends RunnerDiscovery with a way to list the
+// devices a runner is actively using.
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models.  If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []ml.DeviceID
+}
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows.
 | AMD Radeon RX  | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800`    |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
+### Known Workarounds
+- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA
 ### Overrides on Linux
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In

--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -264,14 +264,13 @@ var (
 	rainbowFollowups = []string{
 		"Explain the physics involved in them.  Be breif in your reply",
 		"Explain the chemistry involved in them.  Be breif in your reply",
-		"Explain the quantum mechanics involved in them. Be breif in your reply",
 		"What are common myths related to them? Be brief in your reply",
 		"What are common fairytales related to them? Be brief in your reply",
 		"Can they form if there is no rain?  Be breif in your reply",
 		"Can they form if there are no clouds?  Be breif in your reply",
 		"Do they happen on other planets? Be brief in your reply",
 	}
-	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
 )
 func init() {
@@ -456,6 +455,24 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 			t.Fatal(err)
 		}
 	}
+	// Make sure server is online and healthy before returning
+	listCtx, cancel := context.WithDeadlineCause(
+		ctx,
+		time.Now().Add(120*time.Second),
+		fmt.Errorf("list models took too long"),
+	)
+	defer cancel()
+	models, err := client.ListRunning(listCtx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(models.Models) > 0 {
+		names := make([]string, len(models.Models))
+		for i, m := range models.Models {
+			names[i] = m.Name
+		}
+		slog.Info("currently loaded", "models", names)
+	}
 	return client, testEndpoint, func() {
 		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
@@ -577,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			}, {
 				Model:     smol,
-				Prompt:    "how do rainbows form? Be brief but factual in your reply",
+				Prompt:    rainbowPrompt,
 				Stream:    &stream,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			}, {
@@ -595,7 +612,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
 			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
-			{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"},
+			rainbowExpected,
 			{"fourth", "july", "declaration", "independence"},
 			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}

--- a/llama/llama.go
+++ b/llama/llama.go
@@ -42,6 +42,7 @@ import (
 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
 	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
+	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )
@@ -62,8 +63,8 @@ func BackendInit() {
 	C.llama_backend_init()
 }
-func EnumerateGPUs() []string {
+func EnumerateGPUs() []ml.DeviceID {
-	var ids []string
+	var ids []ml.DeviceID
 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
@@ -71,7 +72,10 @@ func EnumerateGPUs() []string {
 		if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, C.GoString(props.id))
+			ids = append(ids, ml.DeviceID{
+				ID:      C.GoString(props.id),
+				Library: C.GoString(props.library),
+			})
 		}
 	}

--- a/llama/patches/0026-GPU-discovery-enhancements.patch
+++ b/llama/patches/0026-GPU-discovery-enhancements.patch
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -196,7 +196,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 	useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
-		discover.GetGPUInfo().FlashAttentionSupported() &&
+		(discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
 		f.SupportsFlashAttention()
 	var kvct string
@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}
 	// on metal there's no partial offload overhead
-	if gpus[0].Library == "metal" {
+	if gpus[0].Library == "Metal" {
 		graphPartialOffload = graphFullOffload
 	} else if len(gpus) > 1 {
 		// multigpu should always use the partial graph size

--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -12,6 +12,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
 )
 func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
 	// Simple CPU scenario
 	gpus := []discover.GpuInfo{
 		{
-			Library: "cpu",
+			DeviceID: ml.DeviceID{
+				Library: "cpu",
+			},
 		},
 	}
 	projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
 	gpuMinimumMemory := uint64(2048)
 	gpus = []discover.GpuInfo{
 		{
-			Library:       "cuda",
+			DeviceID: ml.DeviceID{
+				Library: "cuda",
+			},
 			MinimumMemory: gpuMinimumMemory,
 		},
 		{
-			Library:       "cuda",
+			DeviceID: ml.DeviceID{
+				Library: "cuda",
+			},
 			MinimumMemory: gpuMinimumMemory,
 		},
 	}

--- a/llm/server.go
+++ b/llm/server.go
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -16,8 +16,8 @@ import (
 func TestLLMServerFitGPU(t *testing.T) {
 	type gpu struct {
-		library string
+		id   ml.DeviceID
-		free    int
+		free int
 	}
 	tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
 		},
 		{
 			name:     "Full single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Partial single GPU",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Single GPU with numGPU 1",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Single GPU with numGPU 0",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   0,
 			expected: ml.GPULayersList{},
 		},
 		{
 			name:     "Single GPU with numGPU 999",
-			gpus:     []gpu{{free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
 		},
 		{
 			name:     "Multi GPU fits on one",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
 		},
 		{
 			name:     "Multi GPU split",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
 		},
 		{
 			name:     "Multi GPU partial",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 1",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 2",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   2,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
 		},
 		{
 			name:     "Multi GPU numGPU 999",
-			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   999,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
 		},
 		{
 			name:     "Multi GPU different libraries",
-			gpus:     []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
+			gpus:     []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
 			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
 			numGPU:   -1,
-			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
+			expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
 		},
 		{
 			name:        "requireFull",
-			gpus:        []gpu{{free: 256 * format.MebiByte}},
+			gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
 			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
 			numGPU:      -1,
 			requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {
 			gpus := make(discover.GpuInfoList, len(tt.gpus))
 			for i := range tt.gpus {
-				gpus[i].ID = fmt.Sprintf("gpu%d", i)
+				gpus[i].DeviceID = tt.gpus[i].id
-				gpus[i].Library = tt.gpus[i].library
 				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
 			}
@@ -164,7 +163,7 @@ func TestLLMServerFitGPU(t *testing.T) {
 			}
 			for i := range s.mem.GPUs {
-				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
+				s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
 				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
 				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 			}

--- a/ml/backend.go
+++ b/ml/backend.go
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
 package ggml
+// #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+// #cgo windows LDFLAGS: -lpthread
 // #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
 // #include <stdlib.h>
 // #include <stdint.h>
@@ -168,6 +170,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	var props C.struct_ggml_backend_dev_props
 	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
 	requiredMemory.CPU.ID = C.GoString(props.id)
+	requiredMemory.CPU.Library = C.GoString(props.library)
 	requiredMemory.CPU.Weights = make([]uint64, blocks+1)
 	requiredMemory.CPU.Cache = make([]uint64, blocks+1)
@@ -186,6 +189,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		var props C.struct_ggml_backend_dev_props
 		C.ggml_backend_dev_get_props(d, &props)
 		requiredMemory.GPUs[i].ID = C.GoString(props.id)
+		requiredMemory.GPUs[i].Library = C.GoString(props.library)
 		requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
 	}
@@ -198,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			for _, l := range p.Layers {
 				if l == layer {
 					for i := range requiredMemory.GPUs {
-						if requiredMemory.GPUs[i].ID == p.ID {
+						if requiredMemory.GPUs[i].DeviceID == p.DeviceID {
 							return gpuDeviceBufferTypes[i]
 						}
 					}
@@ -682,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
 	}
 }
+// BackendDevices builds an ml.DeviceInfo for each entry in gpus from the
+// properties and memory figures reported by the ggml backend device API.
+// When b.allocMemory is set, devices that no scheduler backend is bound to
+// are left out of the result.
+func (b *Backend) BackendDevices() []ml.DeviceInfo {
+	result := []ml.DeviceInfo{}
+	for _, device := range gpus {
+		// With a model loaded on only a subset of the devices, leave the idle ones
+		// alone so they are not initialized and do not trigger VRAM allocations
+		if b.allocMemory {
+			active := false
+			for _, sched := range b.schedBackends {
+				if C.ggml_backend_get_device(sched) == device {
+					active = true
+					break
+				}
+			}
+			if !active {
+				slog.Debug("skipping unused backend device", "description", C.GoString(C.ggml_backend_dev_description(device)))
+				continue
+			}
+		}
+		var props C.struct_ggml_backend_dev_props
+		C.ggml_backend_dev_get_props(device, &props)
+		var info ml.DeviceInfo
+		// Identity
+		info.ID = C.GoString(props.id)
+		info.Library = C.GoString(props.library)
+		info.Name = C.GoString(props.name)
+		info.Description = C.GoString(props.description)
+		// Capability and driver versions
+		info.ComputeMajor = int(props.compute_major)
+		info.ComputeMinor = int(props.compute_minor)
+		info.DriverMajor = int(props.driver_major)
+		info.DriverMinor = int(props.driver_minor)
+		info.Integrated = props.integrated != 0
+		info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
+		info.LibraryPath = ggml.LibPaths()
+		// Query memory again after reading the properties and report those figures
+		C.ggml_backend_dev_memory(device, &props.memory_free, &props.memory_total)
+		info.FreeMemory = uint64(props.memory_free)
+		info.TotalMemory = uint64(props.memory_total)
+		result = append(result, info)
+	}
+	return result
+}
 type Context struct {
 	b *Backend

--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -157,6 +157,15 @@ extern "C" {
        size_t memory_total;
        enum ggml_backend_dev_type type;
        struct ggml_backend_dev_caps caps;
+        int driver_major;
+        int driver_minor;
+        int compute_major;
+        int compute_minor;
+        int integrated;
+        int pci_bus_id;
+        int pci_device_id;
+        int pci_domain_id;
+        const char *library;
    };
    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);

--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
            ggml-threading.h
            ggml-quants.c
            ggml-quants.h
+            mem_hip.cpp
+            mem_nvml.cpp
            gguf.cpp)
 target_include_directories(ggml-base PRIVATE .)

--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu