"git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "d8d70a2856ef760f22d218003b84b411de8b4c68"
Unverified Commit bc8909fb authored by Daniel Hiltgen, committed by GitHub

Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama
runner.  This should eliminate inconsistency between our GPU discovery and the
runner's capabilities at runtime, particularly for cases where we try to filter
out unsupported GPUs.  Now the runner does that implicitly based on the actual
device list.  In some cases free VRAM reporting can be unreliable, which can
lead to scheduling mistakes, so this also includes a patch to leverage more
reliable VRAM reporting libraries if available.

Automatic workarounds have been removed, since only one GPU relied on them; that
workaround is now documented. This GPU will soon fall off the support matrix with
the next ROCm bump.

Additional cleanup of the scheduler and discovery packages can be done in the
future once we have switched on the new memory management code and removed
support for the llama runner.
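
As a rough sketch of how the new flow is consumed (illustrative only, based on the
GPUDevices and LogDetails signatures introduced in this change; the main function
and call site here are not taken from the diff):

// Sketch: bootstrapping discovery with no active runners, then logging what was found.
package main

import (
	"context"

	"github.com/ollama/ollama/discover"
)

func main() {
	ctx := context.Background()
	// With no runners passed in, GPUDevices performs the bootstrap pass: it spawns
	// the Ollama runner against each detected library directory, filters out
	// unsupported devices, and caches the result for later free-VRAM refreshes.
	devices := discover.GPUDevices(ctx, nil)
	discover.LogDetails(devices)
}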
parent 6b50f2b9
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#include "gpu_info_oneapi.h"
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
{"zesInit", (void *)&resp->oh.zesInit},
{"zesDriverGet", (void *)&resp->oh.zesDriverGet},
{"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
{"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
{"zesDeviceEnumMemoryModules",
(void *)&resp->oh.zesDeviceEnumMemoryModules},
{"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
{"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
{NULL, NULL},
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
oneapi_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose,
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
// Unload the library before clearing the handle so it is actually released
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
if (driver >= h.num_drivers || device >= h.num_devices[driver]) {
resp->err = strdup("driver or device index out of bounds");
return;
}
resp->total = 0;
resp->free = 0;
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver >= h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_ONEAPI_H__
#define __GPU_INFO_ONEAPI_H__
#include "gpu_info.h"
#define ZE_MAX_DEVICE_NAME 256
#define ZE_MAX_DEVICE_UUID_SIZE 16
#define ZES_STRING_PROPERTY_SIZE 64
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
typedef uint8_t ze_bool_t;
typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
ZE_DEVICE_TYPE_MCA = 4,
ZE_DEVICE_TYPE_VPU = 5,
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
ZES_DEVICE_TYPE_MCA = 4,
ZES_DEVICE_TYPE_VPU = 5,
ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
uint32_t vendorId;
uint32_t deviceId;
ze_device_property_flags_t flags;
uint32_t subdeviceId;
uint32_t coreClockRate;
uint64_t maxMemAllocSize;
uint32_t maxHardwareContexts;
uint32_t maxCommandQueuePriority;
uint32_t numThreadsPerEU;
uint32_t physicalEUSimdWidth;
uint32_t numEUsPerSubslice;
uint32_t numSubslicesPerSlice;
uint32_t numSlices;
uint64_t timerResolution;
uint32_t timestampValidBits;
uint32_t kernelTimestampValidBits;
ze_device_uuid_t uuid;
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
uint32_t numSubdevices;
char serialNumber[ZES_STRING_PROPERTY_SIZE];
char boardNumber[ZES_STRING_PROPERTY_SIZE];
char brandName[ZES_STRING_PROPERTY_SIZE];
char modelName[ZES_STRING_PROPERTY_SIZE];
char vendorName[ZES_STRING_PROPERTY_SIZE];
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
zes_device_type_t type;
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
ze_bool_t onSubdevice;
uint32_t subdeviceId;
zes_mem_loc_t location;
uint64_t physicalSize;
int32_t busWidth;
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
uint64_t free;
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
zes_device_handle_t *phDevices);
ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
zes_device_properties_t *pProperties);
ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
uint32_t *pCount,
zes_mem_handle_t *phMemory);
ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
zes_mem_properties_t *pProperties);
ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
zes_mem_state_t *pState);
} oneapi_handle_t;
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_ONEAPI_H__
#endif // __APPLE__
package discover
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
assert.Greater(t, info[0].FreeMemory, uint64(0))
}
}
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
func TestByLibrary(t *testing.T) {
type testCase struct {
input []GpuInfo
expect int
}
testCases := map[string]*testCase{
"empty": {input: []GpuInfo{}, expect: 0},
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
}
for k, v := range testCases {
t.Run(k, func(t *testing.T) {
resp := (GpuInfoList)(v.input).ByLibrary()
if len(resp) != v.expect {
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
}
})
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package discover
// Runner based GPU discovery
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
)
var (
deviceMu sync.Mutex
devices []ml.DeviceInfo
libDirs map[string]struct{}
rocmDir string
exe string
bootstrapped bool
)
func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo {
deviceMu.Lock()
defer deviceMu.Unlock()
startDiscovery := time.Now()
msg := "overall device VRAM discovery took"
defer func() {
slog.Debug(msg, "duration", time.Since(startDiscovery))
}()
if !bootstrapped {
msg = "GPU bootstrap discovery took"
libDirs = make(map[string]struct{})
var err error
exe, err = os.Executable()
if err != nil {
slog.Error("unable to lookup executable path", "error", err)
return nil
}
if eval, err := filepath.EvalSymlinks(exe); err == nil {
exe = eval
}
files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
if err != nil {
slog.Debug("unable to lookup runner library directories", "error", err)
}
for _, file := range files {
libDirs[filepath.Dir(file)] = struct{}{}
}
// Our current packaging model places ggml-hip in the main directory
// but keeps rocm in an isolated directory. We have to add it to
// the [LD_LIBRARY_]PATH so ggml-hip will load properly
rocmDir = filepath.Join(LibOllamaPath, "rocm")
if _, err := os.Stat(rocmDir); err != nil {
rocmDir = ""
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
slog.Info("discovering available GPUs...")
// For our initial discovery pass, we gather all the known GPUs through
// all the libraries that were detected. This pass may include GPUs that
// are enumerated, but not actually supported.
// We run this in serial to avoid potentially initializing a GPU multiple
// times concurrently leading to memory contention
for dir := range libDirs {
var dirs []string
if dir == "" {
dirs = []string{LibOllamaPath}
} else {
dirs = []string{LibOllamaPath, dir}
}
// Typically bootstrapping takes < 1s, but on some systems, with devices
// in low power/idle mode, initialization can take multiple seconds. We
// set a long timeout just for bootstrap discovery to reduce the chance
// of giving up too quickly
ctx1stPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
// For this pass, we retain duplicates in case any are incompatible with some libraries
devices = append(devices, bootstrapDevices(ctx1stPass, dirs, nil)...)
}
// In the second pass, we more deeply initialize the GPUs to weed out devices that
// aren't supported by a given library. We run this phase in parallel to speed up discovery.
slog.Debug("filtering out unsupported or overlapping GPU library combinations", "count", len(devices))
ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
var wg sync.WaitGroup
needsDelete := make([]bool, len(devices))
supportedMu := sync.Mutex{}
supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index
for i := range devices {
libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1]
if devices[i].Library == "Metal" {
continue
}
slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
wg.Add(1)
go func(i int) {
defer wg.Done()
var envVar string
if devices[i].Library == "ROCm" {
if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES"
} else {
envVar = "ROCR_VISIBLE_DEVICES"
}
} else {
envVar = "CUDA_VISIBLE_DEVICES"
}
extraEnvs := []string{
"GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs
envVar + "=" + devices[i].ID, // Filter to just this one GPU
}
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
needsDelete[i] = true
} else {
supportedMu.Lock()
if _, ok := supported[devices[i].Library]; !ok {
supported[devices[i].Library] = make(map[string]map[string]int)
}
if _, ok := supported[devices[i].Library][libDir]; !ok {
supported[devices[i].Library][libDir] = make(map[string]int)
}
supported[devices[i].Library][libDir][devices[i].ID] = i
supportedMu.Unlock()
}
}(i)
}
wg.Wait()
logutil.Trace("supported GPU library combinations", "supported", supported)
// Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible
filterOverlapByLibrary(supported, needsDelete)
// TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct
rocmID := 0
for i := 0; i < len(needsDelete); i++ {
if needsDelete[i] {
logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID)
devices = append(devices[:i], devices[i+1:]...)
needsDelete = append(needsDelete[:i], needsDelete[i+1:]...)
i--
} else if devices[i].Library == "ROCm" {
if _, err := strconv.Atoi(devices[i].ID); err == nil {
// Replace the numeric ID with the post-filtered IDs
devices[i].FilteredID = devices[i].ID
devices[i].ID = strconv.Itoa(rocmID)
}
rocmID++
}
}
// Now filter out any overlap with different libraries (favor CUDA/ROCm over others)
for i := 0; i < len(devices); i++ {
for j := i + 1; j < len(devices); j++ {
// For this pass, we only drop exact duplicates
switch devices[i].Compare(devices[j]) {
case ml.SameBackendDevice:
// Same library and device, skip it
devices = append(devices[:j], devices[j+1:]...)
j--
continue
case ml.DuplicateDevice:
// Different library, choose based on priority
var droppedDevice ml.DeviceInfo
if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" {
droppedDevice = devices[j]
} else {
droppedDevice = devices[i]
devices[i] = devices[j]
}
devices = append(devices[:j], devices[j+1:]...)
j--
typeStr := "discrete"
if droppedDevice.Integrated {
typeStr = "iGPU"
}
slog.Debug("dropping duplicate device",
"id", droppedDevice.ID,
"library", droppedDevice.Library,
"compute", droppedDevice.Compute(),
"name", droppedDevice.Name,
"description", droppedDevice.Description,
"libdirs", strings.Join(droppedDevice.LibraryPath, ","),
"driver", droppedDevice.Driver(),
"pci_id", droppedDevice.PCIID,
"type", typeStr,
"total", format.HumanBytes2(droppedDevice.TotalMemory),
"available", format.HumanBytes2(droppedDevice.FreeMemory),
)
continue
}
}
}
// Reset the libDirs to what we actually wind up using for future refreshes
libDirs = make(map[string]struct{})
for _, dev := range devices {
dir := dev.LibraryPath[len(dev.LibraryPath)-1]
if dir != LibOllamaPath {
libDirs[dir] = struct{}{}
}
}
if len(libDirs) == 0 {
libDirs[""] = struct{}{}
}
bootstrapped = true
} else {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
// metal never updates free VRAM
return devices
}
slog.Debug("refreshing free memory")
updated := make([]bool, len(devices))
allDone := func() bool {
allDone := true
for _, done := range updated {
if !done {
allDone = false
break
}
}
return allDone
}
// First try to use existing runners to refresh VRAM since they're already
// active on GPU(s)
for _, runner := range runners {
if runner == nil {
continue
}
deviceIDs := runner.GetActiveDeviceIDs()
if len(deviceIDs) == 0 {
// Skip this runner since it doesn't have active GPU devices
continue
}
// Check to see if this runner is active on any devices that need a refresh
skip := true
devCheck:
for _, dev := range deviceIDs {
for i := range devices {
if dev == devices[i].DeviceID {
if !updated[i] {
skip = false
break devCheck
}
}
}
}
if skip {
continue
}
// Typical refresh on existing runner is ~500ms but allow longer if the system
// is under stress before giving up and using stale data.
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
start := time.Now()
updatedDevices := runner.GetDeviceInfos(ctx)
slog.Debug("existing runner discovery took", "duration", time.Since(start))
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
}
// Short circuit if we've updated all the devices
if allDone() {
break
}
}
if !allDone() {
slog.Debug("unable to refresh all GPUs with existing runners, performing bootstrap discovery")
// Bootstrapping may take longer in some cases (AMD windows), but we
// would rather use stale free data to get the model running sooner
ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
for dir := range libDirs {
updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, nil)
for _, u := range updatedDevices {
for i := range devices {
if u.DeviceID == devices[i].DeviceID {
updated[i] = true
devices[i].FreeMemory = u.FreeMemory
break
}
}
// TODO - consider evaluating if new devices have appeared (e.g. hotplug)
}
if allDone() {
break
}
}
if !allDone() {
slog.Warn("unable to refresh free memory, using old values")
}
}
}
return devices
}
func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) {
// For multi-GPU systems, use the newest version that supports all the GPUs
for _, byLibDirs := range supported {
libDirs := make([]string, 0, len(byLibDirs))
for libDir := range byLibDirs {
libDirs = append(libDirs, libDir)
}
sort.Sort(sort.Reverse(sort.StringSlice(libDirs)))
anyMissing := false
var newest string
for _, newest = range libDirs {
for _, libDir := range libDirs {
if libDir == newest {
continue
}
if len(byLibDirs[newest]) != len(byLibDirs[libDir]) {
anyMissing = true
break
}
for dev := range byLibDirs[newest] {
if _, found := byLibDirs[libDir][dev]; !found {
anyMissing = true
break
}
}
}
if !anyMissing {
break
}
}
// Now we can mark overlaps for deletion
for _, libDir := range libDirs {
if libDir == newest {
continue
}
for dev, i := range byLibDirs[libDir] {
if _, found := byLibDirs[newest][dev]; found {
needsDelete[i] = true
}
}
}
}
}
type bootstrapRunner struct {
port int
cmd *exec.Cmd
}
func (r *bootstrapRunner) GetPort() int {
return r.port
}
func (r *bootstrapRunner) HasExited() bool {
if r.cmd != nil && r.cmd.ProcessState != nil {
return true
}
return false
}
func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo {
// TODO DRY out with llm/server.go
slog.Debug("spawing runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
start := time.Now()
defer func() {
slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs)
}()
port := 0
if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
var l *net.TCPListener
if l, err = net.ListenTCP("tcp", a); err == nil {
port = l.Addr().(*net.TCPAddr).Port
l.Close()
}
}
if port == 0 {
slog.Debug("ResolveTCPAddr failed, using random port")
port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
}
params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)}
var pathEnv string
switch runtime.GOOS {
case "windows":
pathEnv = "PATH"
case "darwin":
pathEnv = "DYLD_LIBRARY_PATH"
default:
pathEnv = "LD_LIBRARY_PATH"
}
libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...)
if rocmDir != "" {
libraryPaths = append(libraryPaths, rocmDir)
}
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
cmd := exec.Command(exe, params...)
cmd.Env = os.Environ()
if envconfig.LogLevel() == logutil.LevelTrace {
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
}
// cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored
cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator)))
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
pathNeeded := true
extraDone := make([]bool, len(extraEnvs))
for i := range cmd.Env {
cmp := strings.SplitN(cmd.Env[i], "=", 2)
if strings.EqualFold(cmp[0], pathEnv) {
cmd.Env[i] = pathEnv + "=" + pathEnvVal
pathNeeded = false
} else {
for j := range extraEnvs {
if extraDone[j] {
continue
}
extra := strings.SplitN(extraEnvs[j], "=", 2)
if cmp[0] == extra[0] {
cmd.Env[i] = extraEnvs[j]
extraDone[j] = true
}
}
}
}
if pathNeeded {
cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal)
}
for i := range extraDone {
if !extraDone[i] {
cmd.Env = append(cmd.Env, extraEnvs[i])
}
}
logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd)
if err := cmd.Start(); err != nil {
slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err)
return nil
}
go func() {
cmd.Wait() // exit status ignored
}()
defer cmd.Process.Kill()
devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd})
if err != nil {
if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 {
// Expected during bootstrapping while we filter out unsupported AMD GPUs
logutil.Trace("runner exited", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "code", cmd.ProcessState.ExitCode())
} else {
slog.Info("failure during GPU discovery", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs, "error", err)
}
}
logutil.Trace("runner enumerated devices", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "devices", devices)
return devices
}
func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) {
var moreDevices []ml.DeviceInfo
port := runner.GetPort()
tick := time.Tick(10 * time.Millisecond)
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to finish discovery before timeout")
case <-tick:
r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
r.Header.Set("Content-Type", "application/json")
resp, err := http.DefaultClient.Do(r)
if err != nil {
// slog.Warn("failed to send request", "error", err)
if runner.HasExited() {
return nil, fmt.Errorf("runner crashed")
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotFound {
// old runner, fall back to bootstrapping model
return nil, fmt.Errorf("llamarunner free vram reporting not supported")
}
body, err := io.ReadAll(resp.Body)
if err != nil {
slog.Warn("failed to read response", "error", err)
continue
}
if resp.StatusCode != 200 {
logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body)
return nil, fmt.Errorf("runner error: %s", string(body))
}
if err := json.Unmarshal(body, &moreDevices); err != nil {
slog.Warn("unmarshal encode response", "error", err)
continue
}
return moreDevices, nil
}
}
}
package discover
import (
"testing"
"github.com/ollama/ollama/app/lifecycle"
)
func init() {
lifecycle.InitLogging()
}
func TestFilterOverlapByLibrary(t *testing.T) {
type testcase struct {
name string
inp map[string]map[string]map[string]int
exp []bool
}
for _, tc := range []testcase{
{
name: "empty",
inp: map[string]map[string]map[string]int{},
exp: []bool{}, // needs deletion
},
{
name: "single no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
},
},
exp: []bool{false},
},
{
name: "100% overlap pick 2nd",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{true, true, false, false},
},
{
name: "100% overlap pick 1st",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 2,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 3,
},
},
},
exp: []bool{false, false, true, true},
},
{
name: "partial overlap pick older",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 1,
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 2,
},
},
},
exp: []bool{true, false, false},
},
{
name: "no overlap",
inp: map[string]map[string]map[string]int{
"CUDA": {
"cuda_v13": {
"GPU-d7b00605-c0c8-152d-529d-e03726d5dc52": 0,
},
"cuda_v12": {
"GPU-cd6c3216-03d2-a8eb-8235-2ffbf571712e": 1,
},
},
},
exp: []bool{false, false},
},
} {
t.Run(tc.name, func(t *testing.T) {
needsDelete := make([]bool, len(tc.exp))
filterOverlapByLibrary(tc.inp, needsDelete)
for i, exp := range tc.exp {
if needsDelete[i] != exp {
t.Fatalf("expected: %v\ngot: %v", tc.exp, needsDelete)
}
}
})
}
}
 package discover
 
 import (
-	"fmt"
+	"context"
 	"log/slog"
+	"path/filepath"
+	"runtime"
+	"strings"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/ml"
 )
 
 type memInfo struct {
@@ -15,8 +19,8 @@ type memInfo struct {
 // Beginning of an `ollama info` command
 type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
+	ml.DeviceID
 	memInfo
-	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant"`
@@ -27,17 +31,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
 	DependencyPath []string `json:"lib_path,omitempty"`
 
-	// Extra environment variables specific to the GPU as list of [key=value]
-	EnvWorkarounds []string `json:"envs,omitempty"`
-
 	// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
 	// the FreeMemory is best effort, and may over or under report actual memory usage
 	// False indicates FreeMemory can generally be trusted on this GPU
 	UnreliableFreeMemory bool
 
 	// GPU information
-	ID       string `json:"gpu_id"` // string to use for selection of this specific GPU
-	filterID int    //nolint:unused,nolintlint // AMD Workaround: The numeric ID of the device used to filter out other devices
+	filterID string // AMD Workaround: The numeric ID of the device used to filter out other devices
 	Name    string `json:"name"`    // user friendly name if available
 	Compute string `json:"compute"` // Compute Capability or gfx
@@ -70,37 +70,8 @@ type CPU struct {
 	ThreadCount int
 }
 
-type CudaGPUInfo struct {
-	GpuInfo
-	OSOverhead   uint64 // Memory overhead between the driver library and management library
-	index        int    //nolint:unused,nolintlint
-	computeMajor int    //nolint:unused,nolintlint
-	computeMinor int    //nolint:unused,nolintlint
-}
-type CudaGPUInfoList []CudaGPUInfo
-
-type RocmGPUInfo struct {
-	GpuInfo
-	usedFilepath string //nolint:unused,nolintlint
-	index        int    //nolint:unused,nolintlint
-}
-type RocmGPUInfoList []RocmGPUInfo
-
-type OneapiGPUInfo struct {
-	GpuInfo
-	driverIndex int //nolint:unused,nolintlint
-	gpuIndex    int //nolint:unused,nolintlint
-}
-type OneapiGPUInfoList []OneapiGPUInfo
-
 type GpuInfoList []GpuInfo
 
-type UnsupportedGPUInfo struct {
-	GpuInfo
-	Reason string `json:"reason"`
-}
-
-// Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
 	libs := []string{}
@@ -125,18 +96,47 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	return resp
 }
 
-// Report the GPU information into the log an Info level
-func (l GpuInfoList) LogDetails() {
-	for _, g := range l {
+func LogDetails(devices []ml.DeviceInfo) {
+	for _, dev := range devices {
+		var libs []string
+		for _, dir := range dev.LibraryPath {
+			if strings.Contains(dir, filepath.Join("lib", "ollama")) {
+				libs = append(libs, filepath.Base(dir))
+			}
+		}
+		typeStr := "discrete"
+		if dev.Integrated {
+			typeStr = "iGPU"
+		}
 		slog.Info("inference compute",
-			"id", g.ID,
-			"library", g.Library,
-			"variant", g.Variant,
-			"compute", g.Compute,
-			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
-			"name", g.Name,
-			"total", format.HumanBytes2(g.TotalMemory),
-			"available", format.HumanBytes2(g.FreeMemory),
+			"id", dev.ID,
+			"library", dev.Library,
+			"compute", dev.Compute(),
+			"name", dev.Name,
+			"description", dev.Description,
+			"libdirs", strings.Join(libs, ","),
+			"driver", dev.Driver(),
+			"pci_id", dev.PCIID,
+			"type", typeStr,
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
+		)
+	}
+	// CPU inference
+	if len(devices) == 0 {
+		dev, _ := GetCPUMem()
+		slog.Info("inference compute",
+			"id", "cpu",
+			"library", "cpu",
+			"compute", "",
+			"name", "cpu",
+			"description", "cpu",
+			"libdirs", "ollama",
+			"driver", "",
+			"pci_id", "",
+			"type", "",
+			"total", format.HumanBytes2(dev.TotalMemory),
+			"available", format.HumanBytes2(dev.FreeMemory),
 		)
 	}
 }
@@ -149,16 +149,15 @@ func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
 
 type SystemInfo struct {
-	System          CPUInfo              `json:"system"`
-	GPUs            []GpuInfo            `json:"gpus"`
-	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
-	DiscoveryErrors []string             `json:"discovery_errors"`
+	System CPUInfo   `json:"system"`
+	GPUs   []GpuInfo `json:"gpus"`
 }
 
 // Return the optimal number of threads to use for inference
 func (si SystemInfo) GetOptimalThreadCount() int {
 	if len(si.System.CPUs) == 0 {
-		return 0
+		// Fall back to Go's num CPU
+		return runtime.NumCPU()
 	}
 	coreCount := 0
@@ -173,9 +172,9 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
 		supportsFA := gpu.Library == "cpu" ||
-			gpu.Library == "metal" ||
-			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
-			gpu.Library == "rocm"
+			gpu.Name == "Metal" || gpu.Library == "Metal" ||
+			(gpu.Library == "CUDA" && gpu.DriverMajor >= 7) ||
+			gpu.Library == "ROCm"
 
 		if !supportsFA {
 			return false
@@ -183,3 +182,31 @@ func (l GpuInfoList) FlashAttentionSupported() bool {
 	}
 	return true
 }
+
+type BaseRunner interface {
+	// GetPort returns the localhost port number the runner is running on
+	GetPort() int
+
+	// HasExited indicates if the runner is no longer running. This can be used during
+	// bootstrap to detect if a given filtered device is incompatible and triggered an assert
+	HasExited() bool
+}
+
+type RunnerDiscovery interface {
+	BaseRunner
+
+	// GetDeviceInfos will perform a query of the underlying device libraries
+	// for device identification and free VRAM information
+	// During bootstrap scenarios, this routine may take seconds to complete
+	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+}
+
+type FilteredRunnerDiscovery interface {
+	RunnerDiscovery
+
+	// GetActiveDeviceIDs returns the filtered set of devices actively in
+	// use by this runner for running models. If the runner is a bootstrap runner, no devices
+	// will be active yet so no device IDs are returned.
+	// This routine will not query the underlying device and will return immediately
+	GetActiveDeviceIDs() []ml.DeviceID
+}
@@ -65,6 +65,9 @@ With ROCm v6.1, the following GPUs are supported on Windows.
 | AMD Radeon RX | `7900 XTX` `7900 XT` `7900 GRE` `7800 XT` `7700 XT` `7600 XT` `7600` `6950 XT` `6900 XTX` `6900XT` `6800 XT` `6800` |
 | AMD Radeon PRO | `W7900` `W7800` `W7700` `W7600` `W7500` `W6900X` `W6800X Duo` `W6800X` `W6800` `V620` |
 
+### Known Workarounds
+
+- The RX Vega 56 requires `HSA_ENABLE_SDMA=0` to disable SDMA
 
 ### Overrides on Linux
 
 Ollama leverages the AMD ROCm library, which does not support all AMD GPUs. In
......
@@ -264,14 +264,13 @@ var (
 	rainbowFollowups = []string{
 		"Explain the physics involved in them. Be breif in your reply",
 		"Explain the chemistry involved in them. Be breif in your reply",
-		"Explain the quantum mechanics involved in them. Be breif in your reply",
 		"What are common myths related to them? Be brief in your reply",
 		"What are common fairytales related to them? Be brief in your reply",
 		"Can they form if there is no rain? Be breif in your reply",
 		"Can they form if there are no clouds? Be breif in your reply",
 		"Do they happen on other planets? Be brief in your reply",
 	}
-	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refract", "reflect", "scatter", "wave", "color", "spectrum", "raindrop", "atmosphere", "frequency", "end", "gold", "fortune", "blessing", "prosperity", "magic", "shower", "sky", "shimmer", "light", "storm", "sunny"}
 )
 
 func init() {
@@ -456,6 +455,24 @@ func InitServerConnection(ctx context.Context, t *testing.T) (*api.Client, strin
 			t.Fatal(err)
 		}
 	}
+
+	// Make sure server is online and healthy before returning
+	listCtx, cancel := context.WithDeadlineCause(
+		ctx,
+		time.Now().Add(120*time.Second),
+		fmt.Errorf("list models took too long"),
+	)
+	defer cancel()
+	models, err := client.ListRunning(listCtx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(models.Models) > 0 {
+		names := make([]string, len(models.Models))
+		for i, m := range models.Models {
+			names[i] = m.Name
+		}
+		slog.Info("currently loaded", "models", names)
+	}
 
 	return client, testEndpoint, func() {
 		if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
@@ -577,7 +594,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 		}, {
 			Model:     smol,
-			Prompt:    "how do rainbows form? Be brief but factual in your reply",
+			Prompt:    rainbowPrompt,
 			Stream:    &stream,
 			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 		}, {
@@ -595,7 +612,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
 			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
-			{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"},
+			rainbowExpected,
 			{"fourth", "july", "declaration", "independence"},
 			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}
......
@@ -42,6 +42,7 @@ import (
 	_ "github.com/ollama/ollama/llama/llama.cpp/common"
 	_ "github.com/ollama/ollama/llama/llama.cpp/src"
 	_ "github.com/ollama/ollama/llama/llama.cpp/tools/mtmd"
+	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
 )
 
@@ -62,8 +63,8 @@ func BackendInit() {
 	C.llama_backend_init()
 }
 
-func EnumerateGPUs() []string {
-	var ids []string
+func EnumerateGPUs() []ml.DeviceID {
+	var ids []ml.DeviceID
 	for i := range C.ggml_backend_dev_count() {
 		device := C.ggml_backend_dev_get(i)
 
@@ -71,7 +72,10 @@ func EnumerateGPUs() []string {
 		if C.ggml_backend_dev_type(device) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			var props C.struct_ggml_backend_dev_props
 			C.ggml_backend_dev_get_props(device, &props)
-			ids = append(ids, C.GoString(props.id))
+			ids = append(ids, ml.DeviceID{
+				ID:      C.GoString(props.id),
+				Library: C.GoString(props.library),
+			})
 		}
 	}
......
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 26 Aug 2025 12:48:29 -0700
Subject: [PATCH] GPU discovery enhancements
Expose more information about the devices through backend props, and leverage
management libraries for more accurate VRAM usage reporting if available.
---
ggml/include/ggml-backend.h | 9 +
ggml/src/CMakeLists.txt | 2 +
ggml/src/ggml-cuda/ggml-cuda.cu | 75 +++++-
ggml/src/ggml-cuda/vendors/hip.h | 1 +
ggml/src/ggml-impl.h | 8 +
ggml/src/ggml-metal/ggml-metal.m | 2 +
ggml/src/mem_hip.cpp | 449 +++++++++++++++++++++++++++++++
ggml/src/mem_nvml.cpp | 172 ++++++++++++
8 files changed, 717 insertions(+), 1 deletion(-)
create mode 100644 ggml/src/mem_hip.cpp
create mode 100644 ggml/src/mem_nvml.cpp
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index fda5ceb24..7c2d86703 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -158,6 +158,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
+ int driver_major;
+ int driver_minor;
+ int compute_major;
+ int compute_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
+ const char *library;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 5158acd6a..3a428a22d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -203,6 +203,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
+ mem_hip.cpp
+ mem_nvml.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e43fde523..14baf0fb1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
+#if defined(GGML_USE_HIP)
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
+ CUDA_CHECK(cudaSetDevice(id));
+ // rocblas_initialize will SIGABRT if the GPU isn't supported
+ rocblas_initialize();
+ GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
+ }
+#endif
+
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
+#ifdef __CUDA_ARCH_LIST__
+ if (std::getenv("GGML_CUDA_INIT") != NULL) {
+ GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
+ }
+#endif // defined(__CUDA_ARCH_LIST__)
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str());
+
#endif // defined(GGML_USE_HIP)
}
@@ -3215,6 +3231,14 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string id;
+ int major;
+ int minor;
+ int driver_major;
+ int driver_minor;
+ int integrated;
+ int pci_bus_id;
+ int pci_device_id;
+ int pci_domain_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3235,6 +3259,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
+
+#if defined(GGML_USE_HIP)
+ if (ggml_hip_mgmt_init() == 0) {
+ int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_hip_mgmt_release();
+ return;
+ }
+ ggml_hip_mgmt_release();
+ }
+#else
+ if (ggml_nvml_init() == 0) {
+ int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
+ if (status == 0) {
+ GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+ ggml_nvml_release();
+ return;
+ }
+ ggml_nvml_release();
+ }
+#endif
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3243,6 +3289,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
+#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3253,6 +3300,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
+#if defined(GGML_USE_HIP)
+ int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
+ props->compute_major = cc / 0x100;
+ props->compute_minor = cc - (props->compute_major * 0x100);
+#else
+ props->compute_major = ctx->major;
+ props->compute_minor = ctx->minor;
+#endif
+ props->driver_major = ctx->driver_major;
+ props->driver_minor = ctx->driver_minor;
+ props->integrated = ctx->integrated;
+ props->pci_bus_id = ctx->pci_bus_id;
+ props->pci_device_id = ctx->pci_device_id;
+ props->pci_domain_id = ctx->pci_domain_id;
+ props->library = GGML_CUDA_NAME;
+
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3843,6 +3907,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
+ int driverVersion = 0;
+ CUDA_CHECK(cudaDriverGetVersion(&driverVersion));
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3853,7 +3919,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
-
+ dev_ctx->major = prop.major;
+ dev_ctx->minor = prop.minor;
+ dev_ctx->driver_major = driverVersion / 1000;
+ dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
+ dev_ctx->integrated = prop.integrated;
+ dev_ctx->pci_bus_id = prop.pciBusID;
+ dev_ctx->pci_device_id = prop.pciDeviceID;
+ dev_ctx->pci_domain_id = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index cf22e60d2..957a795f2 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -42,6 +42,7 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceReset hipDeviceReset
#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDriverGetVersion hipDriverGetVersion
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d..b9b102a5e 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -602,6 +602,14 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return true;
}
+// Management libraries for fetching more accurate free VRAM data
+GGML_API int ggml_nvml_init();
+GGML_API int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total);
+GGML_API void ggml_nvml_release();
+GGML_API int ggml_hip_mgmt_init();
+GGML_API int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total);
+GGML_API void ggml_hip_mgmt_release();
+
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e4c31268f..ec6b385ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -6523,12 +6523,14 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
GGML_UNUSED(dev);
}
+#define GGML_METAL_NAME "Metal"
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_metal_device_get_name(dev);
props->description = ggml_backend_metal_device_get_description(dev);
props->id = "0";
props->type = ggml_backend_metal_device_get_type(dev);
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ props->library = GGML_METAL_NAME;
props->caps = (struct ggml_backend_dev_caps) {
/* .async = */ false,
/* .host_buffer = */ false,
diff --git a/ggml/src/mem_hip.cpp b/ggml/src/mem_hip.cpp
new file mode 100644
index 000000000..8ef19b8cf
--- /dev/null
+++ b/ggml/src/mem_hip.cpp
@@ -0,0 +1,449 @@
+#include "ggml.h"
+
+#ifdef _WIN32
+// AMD Device Library eXtra (ADLX)
+//
+// https://github.com/GPUOpen-LibrariesAndSDKs/ADLX
+//
+// This Windows-only library provides accurate VRAM reporting for AMD GPUs.
+// The runtime DLL is installed with every AMD Driver on Windows, however
+// the SDK isn't a part of the HIP SDK packaging. As such, we avoid including
+// the headers from the SDK to simplify building from source.
+//
+// ADLX relies heavily on function pointer tables.
+// Only the minimal set of types are defined below to facilitate
+// finding the target AMD GPU(s) and querying their current VRAM usage
+// Unused function parameters are commented out to avoid unnecessary type
+// definitions.
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+
+namespace fs = std::filesystem;
+
+#include <stdio.h>
+#include <stdint.h>
+
+// Begin minimal ADLX definitions - derived from tag v1.0 (Dec 2022)
+typedef uint64_t adlx_uint64;
+typedef uint32_t adlx_uint32;
+typedef int32_t adlx_int32;
+typedef adlx_int32 adlx_int;
+typedef adlx_uint32 adlx_uint;
+typedef long adlx_long;
+typedef uint8_t adlx_uint8;
+typedef enum
+{
+ ADLX_OK = 0, /**< @ENG_START_DOX This result indicates success. @ENG_END_DOX */
+ ADLX_ALREADY_ENABLED, /**< @ENG_START_DOX This result indicates that the asked action is already enabled. @ENG_END_DOX */
+ ADLX_ALREADY_INITIALIZED, /**< @ENG_START_DOX This result indicates that ADLX has a unspecified type of initialization. @ENG_END_DOX */
+ ADLX_FAIL, /**< @ENG_START_DOX This result indicates an unspecified failure. @ENG_END_DOX */
+ ADLX_INVALID_ARGS, /**< @ENG_START_DOX This result indicates that the arguments are invalid. @ENG_END_DOX */
+ ADLX_BAD_VER, /**< @ENG_START_DOX This result indicates that the asked version is incompatible with the current version. @ENG_END_DOX */
+ ADLX_UNKNOWN_INTERFACE, /**< @ENG_START_DOX This result indicates that an unknown interface was asked. @ENG_END_DOX */
+ ADLX_TERMINATED, /**< @ENG_START_DOX This result indicates that the calls were made in an interface after ADLX was terminated. @ENG_END_DOX */
+ ADLX_ADL_INIT_ERROR, /**< @ENG_START_DOX This result indicates that the ADL initialization failed. @ENG_END_DOX */
+ ADLX_NOT_FOUND, /**< @ENG_START_DOX This result indicates that the item is not found. @ENG_END_DOX */
+ ADLX_INVALID_OBJECT, /**< @ENG_START_DOX This result indicates that the method was called into an invalid object. @ENG_END_DOX */
+ ADLX_ORPHAN_OBJECTS, /**< @ENG_START_DOX This result indicates that ADLX was terminated with outstanding ADLX objects. Any interface obtained from ADLX points to invalid memory and calls in their methods will result in unexpected behavior. @ENG_END_DOX */
+ ADLX_NOT_SUPPORTED, /**< @ENG_START_DOX This result indicates that the asked feature is not supported. @ENG_END_DOX */
+ ADLX_PENDING_OPERATION, /**< @ENG_START_DOX This result indicates a failure due to an operation currently in progress. @ENG_END_DOX */
+ ADLX_GPU_INACTIVE /**< @ENG_START_DOX This result indicates that the GPU is inactive. @ENG_END_DOX */
+} ADLX_RESULT;
+#define ADLX_SUCCEEDED(x) (ADLX_OK == (x) || ADLX_ALREADY_ENABLED == (x) || ADLX_ALREADY_INITIALIZED == (x))
+#define ADLX_FAILED(x) (ADLX_OK != (x) && ADLX_ALREADY_ENABLED != (x) && ADLX_ALREADY_INITIALIZED != (x))
+#define ADLX_VER_MAJOR 1
+#define ADLX_VER_MINOR 0
+#define ADLX_VER_RELEASE 5
+#define ADLX_VER_BUILD_NUM 30
+#define ADLX_MAKE_FULL_VER(VERSION_MAJOR, VERSION_MINOR, VERSION_RELEASE, VERSION_BUILD_NUM) ( ((adlx_uint64)(VERSION_MAJOR) << 48ull) | ((adlx_uint64)(VERSION_MINOR) << 32ull) | ((adlx_uint64)(VERSION_RELEASE) << 16ull) | (adlx_uint64)(VERSION_BUILD_NUM))
+#define ADLX_FULL_VERSION ADLX_MAKE_FULL_VER(ADLX_VER_MAJOR, ADLX_VER_MINOR, ADLX_VER_RELEASE, ADLX_VER_BUILD_NUM)
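+// e.g. ADLX_FULL_VERSION for 1.0.5.30 packs as ((adlx_uint64)1 << 48) | (0 << 32) | (5 << 16) | 30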
+#define ADLX_CORE_LINK __declspec(dllexport)
+#define ADLX_STD_CALL __stdcall
+#define ADLX_CDECL_CALL __cdecl
+#define ADLX_FAST_CALL __fastcall
+#define ADLX_INLINE __inline
+#define ADLX_FORCEINLINE __forceinline
+#define ADLX_NO_VTABLE __declspec(novtable)
+
+#if defined(__cplusplus)
+typedef bool adlx_bool;
+#else
+typedef adlx_uint8 adlx_bool;
+#define true 1
+#define false 0
+#endif
+
+typedef struct IADLXSystem IADLXSystem;
+typedef struct IADLXGPUList IADLXGPUList;
+typedef struct IADLXGPU IADLXGPU;
+typedef struct IADLXInterface IADLXInterface;
+typedef struct IADLXPerformanceMonitoringServices IADLXPerformanceMonitoringServices;
+typedef struct IADLXGPUMetrics IADLXGPUMetrics;
+typedef struct IADLXGPUMetricsSupport IADLXGPUMetricsSupport;
+
+typedef struct IADLXSystemVtbl
+{
+ // IADLXSystem interface
+ ADLX_RESULT (ADLX_STD_CALL *GetHybridGraphicsType)(/* IADLXSystem* pThis, ADLX_HG_TYPE* hgType */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUs)(IADLXSystem* pThis, IADLXGPUList** ppGPUs); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXSystem* pThis, const wchar_t* interfaceId, void** ppInterface */);
+ ADLX_RESULT (ADLX_STD_CALL *GetDisplaysServices)(/* IADLXSystem* pThis, IADLXDisplayServices** ppDispServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetDesktopsServices)(/* IADLXSystem* pThis, IADLXDesktopServices** ppDeskServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUsChangedHandling)(/* IADLXSystem* pThis, IADLXGPUsChangedHandling** ppGPUsChangedHandling */);
+ ADLX_RESULT (ADLX_STD_CALL *EnableLog)(/* IADLXSystem* pThis, ADLX_LOG_DESTINATION mode, ADLX_LOG_SEVERITY severity, IADLXLog* pLogger, const wchar_t* fileName */);
+ ADLX_RESULT (ADLX_STD_CALL *Get3DSettingsServices)(/* IADLXSystem* pThis, IADLX3DSettingsServices** pp3DSettingsServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUTuningServices)(/* IADLXSystem* pThis, IADLXGPUTuningServices** ppGPUTuningServices */);
+ ADLX_RESULT (ADLX_STD_CALL *GetPerformanceMonitoringServices)(IADLXSystem* pThis, IADLXPerformanceMonitoringServices** ppPerformanceMonitoringServices); // Used
+ ADLX_RESULT (ADLX_STD_CALL *TotalSystemRAM)(/* IADLXSystem* pThis, adlx_uint* ramMB */);
+ ADLX_RESULT (ADLX_STD_CALL *GetI2C)(/* IADLXSystem* pThis, IADLXGPU* pGPU, IADLXI2C** ppI2C */);
+} IADLXSystemVtbl;
+struct IADLXSystem { const IADLXSystemVtbl *pVtbl; };
+
+typedef struct IADLXGPUVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPU* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXGPU* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPU* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPU
+ ADLX_RESULT (ADLX_STD_CALL *VendorId)(/* IADLXGPU* pThis, const char** vendorId */);
+ ADLX_RESULT (ADLX_STD_CALL *ASICFamilyType)(/* IADLXGPU* pThis, ADLX_ASIC_FAMILY_TYPE* asicFamilyType */);
+ ADLX_RESULT (ADLX_STD_CALL *Type)(/* IADLXGPU* pThis, ADLX_GPU_TYPE* gpuType */);
+ ADLX_RESULT (ADLX_STD_CALL *IsExternal)(/* IADLXGPU* pThis, adlx_bool* isExternal */);
+ ADLX_RESULT (ADLX_STD_CALL *Name)(/* IADLXGPU* pThis, const char** gpuName */);
+ ADLX_RESULT (ADLX_STD_CALL *DriverPath)(/* IADLXGPU* pThis, const char** driverPath */);
+ ADLX_RESULT (ADLX_STD_CALL *PNPString)(/* IADLXGPU* pThis, const char** pnpString */);
+ ADLX_RESULT (ADLX_STD_CALL *HasDesktops)(/* IADLXGPU* pThis, adlx_bool* hasDesktops */);
+ ADLX_RESULT (ADLX_STD_CALL *TotalVRAM)(IADLXGPU* pThis, adlx_uint* vramMB); // Used
+ ADLX_RESULT (ADLX_STD_CALL *VRAMType)(/* IADLXGPU* pThis, const char** type */);
+ ADLX_RESULT (ADLX_STD_CALL *BIOSInfo)(/* IADLXGPU* pThis, const char** partNumber, const char** version, const char** date */);
+ ADLX_RESULT (ADLX_STD_CALL *DeviceId)(/* IADLXGPU* pThis, const char** deviceId */);
+ ADLX_RESULT (ADLX_STD_CALL *RevisionId)(/* IADLXGPU* pThis, const char** revisionId */);
+ ADLX_RESULT (ADLX_STD_CALL *SubSystemId)(/* IADLXGPU* pThis, const char** subSystemId */);
+ ADLX_RESULT (ADLX_STD_CALL *SubSystemVendorId)(/* IADLXGPU* pThis, const char** subSystemVendorId */);
+ ADLX_RESULT (ADLX_STD_CALL *UniqueId)(IADLXGPU* pThis, adlx_int* uniqueId); // Used
+} IADLXGPUVtbl;
+struct IADLXGPU { const IADLXGPUVtbl *pVtbl; };
+
+typedef struct IADLXGPUListVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXGPUList* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXGPUList* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXGPUList* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXList
+ adlx_uint (ADLX_STD_CALL *Size)(/* IADLXGPUList* pThis */);
+ adlx_uint8 (ADLX_STD_CALL *Empty)(/* IADLXGPUList* pThis */);
+ adlx_uint (ADLX_STD_CALL *Begin)(IADLXGPUList* pThis); // Used
+ adlx_uint (ADLX_STD_CALL *End)(IADLXGPUList* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *At)(/* IADLXGPUList* pThis, const adlx_uint location, IADLXInterface** ppItem */);
+ ADLX_RESULT (ADLX_STD_CALL *Clear)(/* IADLXGPUList* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *Remove_Back)(/* IADLXGPUList* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *Add_Back)(/* IADLXGPUList* pThis, IADLXInterface* pItem */);
+
+ //IADLXGPUList
+ ADLX_RESULT (ADLX_STD_CALL *At_GPUList)(IADLXGPUList* pThis, const adlx_uint location, IADLXGPU** ppItem); // Used
+ ADLX_RESULT (ADLX_STD_CALL *Add_Back_GPUList)(/* IADLXGPUList* pThis, IADLXGPU* pItem */);
+
+} IADLXGPUListVtbl;
+struct IADLXGPUList { const IADLXGPUListVtbl *pVtbl; };
+
+typedef struct IADLXPerformanceMonitoringServicesVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL *Acquire)(/* IADLXPerformanceMonitoringServices* pThis */);
+ adlx_long (ADLX_STD_CALL *Release)(IADLXPerformanceMonitoringServices* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL *QueryInterface)(/* IADLXPerformanceMonitoringServices* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXPerformanceMonitoringServices
+ ADLX_RESULT (ADLX_STD_CALL *GetSamplingIntervalRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+ ADLX_RESULT (ADLX_STD_CALL *SetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int intervalMs */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSamplingInterval)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* intervalMs */);
+ ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySizeRange)(/* IADLXPerformanceMonitoringServices* pThis, ADLX_IntRange* range */);
+ ADLX_RESULT (ADLX_STD_CALL *SetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *GetMaxPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *ClearPerformanceMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentPerformanceMetricsHistorySize)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int* sizeSec */);
+ ADLX_RESULT (ADLX_STD_CALL *StartPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *StopPerformanceMetricsTracking)(/* IADLXPerformanceMonitoringServices* pThis */);
+ ADLX_RESULT (ADLX_STD_CALL *GetAllMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXAllMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetGPUMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, adlx_int startMs, adlx_int stopMs, IADLXGPUMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSystemMetricsHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXSystemMetricsList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetFPSHistory)(/* IADLXPerformanceMonitoringServices* pThis, adlx_int startMs, adlx_int stopMs, IADLXFPSList** ppMetricsList */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentAllMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXAllMetrics** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetrics** ppMetrics); // Used
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetrics** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetCurrentFPS)(/* IADLXPerformanceMonitoringServices* pThis, IADLXFPS** ppMetrics */);
+ ADLX_RESULT (ADLX_STD_CALL *GetSupportedGPUMetrics)(IADLXPerformanceMonitoringServices* pThis, IADLXGPU* pGPU, IADLXGPUMetricsSupport** ppMetricsSupported); // Used
+ ADLX_RESULT (ADLX_STD_CALL *GetSupportedSystemMetrics)(/* IADLXPerformanceMonitoringServices* pThis, IADLXSystemMetricsSupport** ppMetricsSupported */);
+}IADLXPerformanceMonitoringServicesVtbl;
+struct IADLXPerformanceMonitoringServices { const IADLXPerformanceMonitoringServicesVtbl *pVtbl; };
+
+typedef struct IADLXGPUMetricsSupportVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetricsSupport* pThis */);
+ adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetricsSupport* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetricsSupport* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPUMetricsSupport
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUUsage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAMClockSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUHotspotTemperature)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUTotalBoardPower)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUFanSpeed)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVRAM)(IADLXGPUMetricsSupport* pThis, adlx_bool* supported); // Used
+ ADLX_RESULT (ADLX_STD_CALL* IsSupportedGPUVoltage)(/* IADLXGPUMetricsSupport* pThis, adlx_bool* supported */);
+
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUUsageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMClockSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUHotspotTemperatureRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUFanSpeedRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVRAMRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUVoltageRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+ ADLX_RESULT (ADLX_STD_CALL* GetGPUTotalBoardPowerRange)(/* IADLXGPUMetricsSupport* pThis, adlx_int* minValue, adlx_int* maxValue */);
+} IADLXGPUMetricsSupportVtbl;
+struct IADLXGPUMetricsSupport { const IADLXGPUMetricsSupportVtbl *pVtbl; };
+
+typedef struct IADLXGPUMetricsVtbl
+{
+ //IADLXInterface
+ adlx_long (ADLX_STD_CALL* Acquire)(/* IADLXGPUMetrics* pThis */);
+ adlx_long (ADLX_STD_CALL* Release)(IADLXGPUMetrics* pThis); // Used
+ ADLX_RESULT (ADLX_STD_CALL* QueryInterface)(/* IADLXGPUMetrics* pThis, const wchar_t* interfaceId, void** ppInterface */);
+
+ //IADLXGPUMetrics
+ ADLX_RESULT (ADLX_STD_CALL* TimeStamp)(/* IADLXGPUMetrics* pThis, adlx_int64* ms */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUUsage)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUVRAMClockSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUHotspotTemperature)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUTotalBoardPower)(/* IADLXGPUMetrics* pThis, adlx_double* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUFanSpeed)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+ ADLX_RESULT (ADLX_STD_CALL* GPUVRAM)(IADLXGPUMetrics* pThis, adlx_int* data); // Used
+ ADLX_RESULT (ADLX_STD_CALL* GPUVoltage)(/* IADLXGPUMetrics* pThis, adlx_int* data */);
+} IADLXGPUMetricsVtbl;
+struct IADLXGPUMetrics { const IADLXGPUMetricsVtbl *pVtbl; };
+
+struct {
+ void *handle;
+ ADLX_RESULT (*ADLXInitialize)(adlx_uint64 version, IADLXSystem** ppSystem);
+ ADLX_RESULT (*ADLXInitializeWithIncompatibleDriver)(adlx_uint64 version, IADLXSystem** ppSystem);
+ ADLX_RESULT (*ADLXQueryVersion)(const char** version);
+ ADLX_RESULT (*ADLXTerminate)();
+ IADLXSystem *sys;
+} adlx { NULL, NULL, NULL, NULL, NULL, NULL };
+static std::mutex ggml_adlx_lock;
+
+extern "C" {
+
+int ggml_hip_mgmt_init() {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle != NULL) {
+ // Already initialized
+ return 0;
+ }
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath = fs::path("\\Windows") / fs::path("System32") / fs::path("amdadlx64.dll");
+
+ adlx.handle = (void*)LoadLibraryW(libPath.wstring().c_str());
+ if (adlx.handle == NULL) {
+ return ADLX_NOT_FOUND;
+ }
+
+ adlx.ADLXInitialize = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitialize");
+ adlx.ADLXInitializeWithIncompatibleDriver = (ADLX_RESULT (*)(adlx_uint64 version, IADLXSystem **ppSystem)) GetProcAddress((HMODULE)(adlx.handle), "ADLXInitializeWithIncompatibleDriver");
+ adlx.ADLXTerminate = (ADLX_RESULT (*)()) GetProcAddress((HMODULE)(adlx.handle), "ADLXTerminate");
+ adlx.ADLXQueryVersion = (ADLX_RESULT (*)(const char **version)) GetProcAddress((HMODULE)(adlx.handle), "ADLXQueryVersion");
+ if (adlx.ADLXInitialize == NULL || adlx.ADLXInitializeWithIncompatibleDriver == NULL || adlx.ADLXTerminate == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in amdadlx64.dll, falling back to hip free memory reporting", __func__);
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+ return ADLX_NOT_FOUND;
+ }
+
+ SetErrorMode(old_mode);
+
+ // Aid in troubleshooting...
+ if (adlx.ADLXQueryVersion != NULL) {
+ const char *version = NULL;
+ ADLX_RESULT status = adlx.ADLXQueryVersion(&version);
+ if (ADLX_SUCCEEDED(status)) {
+ GGML_LOG_DEBUG("%s located ADLX version %s\n", __func__, version);
+ }
+ }
+
+ ADLX_RESULT status = adlx.ADLXInitialize(ADLX_FULL_VERSION, &adlx.sys);
+ if (ADLX_FAILED(status)) {
+ // GGML_LOG_DEBUG("%s failed to initialize ADLX error=%d - attempting with incompatible driver...\n", __func__, status);
+ // Try with the incompatible driver
+ status = adlx.ADLXInitializeWithIncompatibleDriver(ADLX_FULL_VERSION, &adlx.sys);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s failed to initialize ADLX error=%d\n", __func__, status);
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+ adlx.sys = NULL;
+ return status;
+ }
+    // GGML_LOG_DEBUG("%s initialized ADLX with incompatible driver\n", __func__);
+ }
+ return ADLX_OK;
+}
+
+void ggml_hip_mgmt_release() {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle == NULL) {
+ // Already free
+ return;
+ }
+ ADLX_RESULT status = adlx.ADLXTerminate();
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s failed to terminate Adlx %d\n", __func__, status);
+ // Unload anyway...
+ }
+ FreeLibrary((HMODULE)(adlx.handle));
+ adlx.handle = NULL;
+}
+
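+// Helper macro: release whichever ADLX interfaces were acquired in ggml_hip_get_device_memory before returning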
+#define adlx_gdm_cleanup \
+ if (gpuMetricsSupport != NULL) gpuMetricsSupport->pVtbl->Release(gpuMetricsSupport); \
+ if (gpuMetrics != NULL) gpuMetrics->pVtbl->Release(gpuMetrics); \
+ if (perfMonitoringServices != NULL) perfMonitoringServices->pVtbl->Release(perfMonitoringServices); \
+ if (gpus != NULL) gpus->pVtbl->Release(gpus); \
+ if (gpu != NULL) gpu->pVtbl->Release(gpu)
+
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+ std::lock_guard<std::mutex> lock(ggml_adlx_lock);
+ if (adlx.handle == NULL) {
+ GGML_LOG_INFO("%s ADLX was not initialized\n", __func__);
+ return ADLX_ADL_INIT_ERROR;
+ }
+ IADLXGPUMetricsSupport *gpuMetricsSupport = NULL;
+ IADLXPerformanceMonitoringServices *perfMonitoringServices = NULL;
+ IADLXGPUList* gpus = NULL;
+ IADLXGPU* gpu = NULL;
+ IADLXGPUMetrics *gpuMetrics = NULL;
+ ADLX_RESULT status;
+ // The "UniqueID" exposed in ADLX is the PCI Bus and Device IDs
+ adlx_int target = (pci_bus_id << 8) | (pci_device_id & 0xff);
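+    // e.g. a GPU at PCI bus 0x03, device 0x00 maps to UniqueId 0x0300 (illustrative)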
+
+ status = adlx.sys->pVtbl->GetPerformanceMonitoringServices(adlx.sys, &perfMonitoringServices);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetPerformanceMonitoringServices failed %d\n", __func__, status);
+ return status;
+ }
+
+ status = adlx.sys->pVtbl->GetGPUs(adlx.sys, &gpus);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetGPUs failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ // Get GPU list
+ for (adlx_uint crt = gpus->pVtbl->Begin(gpus); crt != gpus->pVtbl->End(gpus); ++crt)
+ {
+ status = gpus->pVtbl->At_GPUList(gpus, crt, &gpu);
+ if (ADLX_FAILED(status))
+ {
+ GGML_LOG_INFO("%s %d] At_GPUList failed %d\n", __func__, crt, status);
+ continue;
+ }
+ adlx_int id;
+ status = gpu->pVtbl->UniqueId(gpu, &id);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s %d] UniqueId lookup failed %d\n", __func__, crt, status);
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
+ }
+ if (id != target) {
+ GGML_LOG_DEBUG("%s %d] GPU UniqueId: %x does not match target %02x %02x\n", __func__, crt, id, pci_bus_id, pci_device_id);
+ gpu->pVtbl->Release(gpu);
+ gpu = NULL;
+ continue;
+ }
+ // Any failures at this point should cause a fall-back to other APIs
+ status = perfMonitoringServices->pVtbl->GetSupportedGPUMetrics(perfMonitoringServices, gpu, &gpuMetricsSupport);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetSupportedGPUMetrics failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+ status = perfMonitoringServices->pVtbl->GetCurrentGPUMetrics(perfMonitoringServices, gpu, &gpuMetrics);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GetCurrentGPUMetrics failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_bool supported = false;
+ status = gpuMetricsSupport->pVtbl->IsSupportedGPUVRAM(gpuMetricsSupport, &supported);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s IsSupportedGPUVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_uint totalVRAM = 0;
+ status = gpu->pVtbl->TotalVRAM(gpu, &totalVRAM);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s TotalVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
+
+ adlx_int usedVRAM = 0;
+ status = gpuMetrics->pVtbl->GPUVRAM(gpuMetrics, &usedVRAM);
+ if (ADLX_FAILED(status)) {
+ GGML_LOG_INFO("%s GPUVRAM failed %d\n", __func__, status);
+ adlx_gdm_cleanup;
+ return status;
+ }
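+    // ADLX reports VRAM in MB; convert to bytes for the caller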
+ *total = size_t(totalVRAM) * 1024 * 1024;
+ *free = size_t(totalVRAM-usedVRAM) * 1024 * 1024;
+
+ adlx_gdm_cleanup;
+ return ADLX_OK;
+ }
+ adlx_gdm_cleanup;
+ return ADLX_NOT_FOUND;
+}
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+// TODO Linux implementation of accurate VRAM reporting
+int ggml_hip_mgmt_init() {
+ return -1;
+}
+void ggml_hip_mgmt_release() {}
+int ggml_hip_get_device_memory(int pci_bus_id, int pci_device_id, size_t *free, size_t *total) {
+ return -1;
+}
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
\ No newline at end of file
diff --git a/ggml/src/mem_nvml.cpp b/ggml/src/mem_nvml.cpp
new file mode 100644
index 000000000..aa05e9dc1
--- /dev/null
+++ b/ggml/src/mem_nvml.cpp
@@ -0,0 +1,172 @@
+// NVIDIA Management Library (NVML)
+//
+// https://developer.nvidia.com/management-library-nvml
+//
+// This library provides accurate VRAM reporting for NVIDIA GPUs, particularly
+// on Windows, where the cuda library provides inaccurate VRAM usage metrics. The
+// runtime DLL is installed with every driver on Windows, and most Linux
+// systems, and the headers are included in the standard CUDA SDK install. As
+// such, we can include the header here to simplify the code.
+
+
+#include "ggml-impl.h"
+#include <filesystem>
+#include <mutex>
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+
+namespace fs = std::filesystem;
+
+// Minimal definitions to avoid including the nvml.h header
+typedef enum nvmlReturn_enum
+{
+ // cppcheck-suppress *
+ NVML_SUCCESS = 0, //!< The operation was successful
+ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
+ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
+ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
+ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
+ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
+ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
+ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
+ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
+ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
+ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
+ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
+ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
+ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
+ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
+ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
+ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
+ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
+ NVML_ERROR_MEMORY = 20, //!< Insufficient memory
+ NVML_ERROR_NO_DATA = 21, //!< No data
+ NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled
+ NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory
+ NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory
+ NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported
+ NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated
+ NVML_ERROR_NOT_READY = 27, //!< The system is not ready for the request
+ NVML_ERROR_GPU_NOT_FOUND = 28, //!< No GPUs were found
+ NVML_ERROR_INVALID_STATE = 29, //!< Resource not in correct state to perform requested operation
+ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
+} nvmlReturn_t;
+typedef struct nvmlDevice_st* nvmlDevice_t;
+typedef struct nvmlMemory_st
+{
+ unsigned long long total; //!< Total physical device memory (in bytes)
+ unsigned long long free; //!< Unallocated device memory (in bytes)
+ unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes).
+ //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
+} nvmlMemory_t;
+// end nvml.h definitions
+
+struct {
+ void *handle;
+ nvmlReturn_t (*nvmlInit_v2)(void);
+ nvmlReturn_t (*nvmlShutdown)(void);
+ nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
+ nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml { NULL, NULL, NULL, NULL, NULL };
+static std::mutex ggml_nvml_lock;
+
+extern "C" {
+
+int ggml_nvml_init() {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle != NULL) {
+ // Already initialized
+ return 0;
+ }
+#ifdef _WIN32
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+ fs::path libPath[2];
+ const char * programDir = std::getenv("ProgramW6432");
+ if (programDir == NULL) {
+ libPath[0] = fs::path("Program Files") / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+ } else {
+ libPath[0] = fs::path(programDir) / fs::path("NVIDIA Corporation") / fs::path("NVSMI") / fs::path("NVML.dll");
+ }
+ libPath[1] = fs::path("\\Windows") / fs::path("System32") / fs::path("NVML.dll");
+
+ for (int i = 0; i < 2; i++) {
+ nvml.handle = (void*)LoadLibraryW(libPath[i].wstring().c_str());
+ if (nvml.handle != NULL) {
+ break;
+ }
+ }
+ if (nvml.handle == NULL) {
+ return NVML_ERROR_NOT_FOUND;
+ }
+
+ nvml.nvmlInit_v2 = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlInit_v2");
+ nvml.nvmlShutdown = (nvmlReturn_enum (*)()) GetProcAddress((HMODULE)(nvml.handle), "nvmlShutdown");
+ nvml.nvmlDeviceGetHandleByUUID = (nvmlReturn_t (*)(const char *, nvmlDevice_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetHandleByUUID");
+ nvml.nvmlDeviceGetMemoryInfo = (nvmlReturn_t (*)(nvmlDevice_t, nvmlMemory_t *)) GetProcAddress((HMODULE)(nvml.handle), "nvmlDeviceGetMemoryInfo");
+ if (nvml.nvmlInit_v2 == NULL || nvml.nvmlShutdown == NULL || nvml.nvmlDeviceGetHandleByUUID == NULL || nvml.nvmlDeviceGetMemoryInfo == NULL) {
+ GGML_LOG_INFO("%s unable to locate required symbols in NVML.dll", __func__);
+ FreeLibrary((HMODULE)(nvml.handle));
+ nvml.handle = NULL;
+ return NVML_ERROR_NOT_FOUND;
+ }
+
+ SetErrorMode(old_mode);
+
+#else
+ // Not currently wired up on Linux
+ return NVML_ERROR_NOT_SUPPORTED;
+#endif
+    return nvml.nvmlInit_v2();
+}
+
+void ggml_nvml_release() {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle == NULL) {
+ // Already free
+ return;
+ }
+ nvmlReturn_enum status = nvml.nvmlShutdown();
+ if (status != NVML_SUCCESS) {
+ GGML_LOG_INFO("%s failed to shutdown NVML: %d\n", __func__, status);
+ }
+#ifdef _WIN32
+ FreeLibrary((HMODULE)(nvml.handle));
+ nvml.handle = NULL;
+#else
+ // Not currently wired up on Linux
+#endif
+}
+
+int ggml_nvml_get_device_memory(const char *uuid, size_t *free, size_t *total) {
+ std::lock_guard<std::mutex> lock(ggml_nvml_lock);
+ if (nvml.handle == NULL) {
+ return NVML_ERROR_UNINITIALIZED;
+ }
+ nvmlDevice_t device;
+ auto status = nvml.nvmlDeviceGetHandleByUUID(uuid, &device);
+ if (status != NVML_SUCCESS) {
+ return status;
+ }
+ nvmlMemory_t memInfo = {0};
+ status = nvml.nvmlDeviceGetMemoryInfo(device, &memInfo);
+ if (status == NVML_SUCCESS) {
+ *free = memInfo.free;
+ *total = memInfo.total;
+ }
+ return status;
+}
+
+}
\ No newline at end of file
@@ -196,7 +196,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
    }
    useFlashAttention := (envconfig.FlashAttention() || f.FlashAttention()) &&
-        discover.GetGPUInfo().FlashAttentionSupported() &&
+        (discover.GpuInfoList)(gpus).FlashAttentionSupported() &&
        f.SupportsFlashAttention()
    var kvct string
@@ -231,7 +231,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
    }
    // on metal there's no partial offload overhead
-    if gpus[0].Library == "metal" {
+    if gpus[0].Library == "Metal" {
        graphPartialOffload = graphFullOffload
    } else if len(gpus) > 1 {
        // multigpu should always use the partial graph size
...
@@ -12,6 +12,7 @@ import (
    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/fs/ggml"
+    "github.com/ollama/ollama/ml"
)
func TestEstimateGPULayers(t *testing.T) {
@@ -55,7 +56,9 @@ func TestEstimateGPULayers(t *testing.T) {
    // Simple CPU scenario
    gpus := []discover.GpuInfo{
        {
-            Library: "cpu",
+            DeviceID: ml.DeviceID{
+                Library: "cpu",
+            },
        },
    }
    projectors := []string{}
@@ -77,11 +80,15 @@ func TestEstimateGPULayers(t *testing.T) {
    gpuMinimumMemory := uint64(2048)
    gpus = []discover.GpuInfo{
        {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
            MinimumMemory: gpuMinimumMemory,
        },
        {
-            Library: "cuda",
+            DeviceID: ml.DeviceID{
+                Library: "cuda",
+            },
            MinimumMemory: gpuMinimumMemory,
        },
    }
...
@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
    ModelPath() string
-    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
+    Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
    Ping(ctx context.Context) error
    WaitUntilRunning(ctx context.Context) error
    Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,11 @@ type LlamaServer interface {
    Close() error
    VRAMSize() uint64 // Total VRAM across all GPUs
    TotalSize() uint64
-    VRAMByGPU(gpuID string) uint64
+    VRAMByGPU(id ml.DeviceID) uint64
    Pid() int
+    GetPort() int
+    GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
+    HasExited() bool
}
// llmServer is an instance of a runner hosting a single model
@@ -331,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
        if gpu.DependencyPath != nil {
            slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
            libraryPaths = append(gpu.DependencyPath, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
        }
    }
@@ -361,12 +365,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
    s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
-    envWorkarounds := []string{}
-    for _, gpu := range gpus {
-        envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
-    }
    // Always filter down the set of GPUs in case there are any unsupported devices that might crash
-    envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
+    envWorkarounds := gpus.GetVisibleDevicesEnv()
    pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
    // Update or add the path variable with our adjusted version
@@ -496,7 +496,7 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
-func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
    systemInfo := discover.GetSystemInfo()
    systemTotalMemory := systemInfo.System.TotalMemory
    systemFreeMemory := systemInfo.System.FreeMemory
@@ -509,7 +509,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
            g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
        } else {
            slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
-            return ErrLoadRequiredFull
+            return nil, ErrLoadRequiredFull
        }
    }
@@ -518,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    if len(gpus) > 1 || gpus[0].Library != "cpu" {
        switch {
-        case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
+        case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
            // disable partial offloading when model is greater than total system memory as this
            // can lead to locking up the system
            s.options.NumGPU = 0
-        case gpus[0].Library != "metal" && s.estimate.Layers == 0:
+        case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
            // Don't bother loading into the GPU if no layers can fit
-            gpus = discover.GetCPUInfo()
+            gpus = discover.GpuInfoList{discover.GetCPUInfo()}
        case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
            s.options.NumGPU = s.estimate.Layers
        }
@@ -537,7 +537,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
        available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
        if systemMemoryRequired > available {
            slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
-            return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
+            return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
        }
    }
@@ -552,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    // mmap has issues with partial offloading on metal
    for _, g := range gpus {
-        if g.Library == "metal" &&
+        if g.Library == "Metal" &&
            uint64(s.options.NumGPU) > 0 &&
            uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
            s.options.UseMMap = new(bool)
@@ -563,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    // Windows CUDA should not use mmap for best performance
    // Linux with a model larger than free space, mmap leads to thrashing
    // For CPU loads we want the memory to be allocated, not FS cache
-    if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
+    if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
        (runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
        (gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
        (s.options.UseMMap != nil && !*s.options.UseMMap) {
@@ -572,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    }
    if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
    }
    resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
    if err != nil {
-        return err
+        return nil, err
    }
    // On the Ollama engine, we can print out a summary of the memory allocations.
@@ -588,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
    if !resp.Success {
        slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
-        return errors.New("failed to allocate memory for model")
+        return nil, errors.New("failed to allocate memory for model")
    }
    // The llama engine does its memory allocations together with model loading, so we
    // need to wait until it is done to ensure that we have accurate memory data before
    // loading the next model
    if s.textProcessor == nil {
-        return s.WaitUntilRunning(ctx)
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
    } else {
-        return nil
+        return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
    }
}
@@ -610,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
    gpuLayers := make(ml.GPULayersList, len(gpus))
    for i := range gpuLayers {
-        gpuLayers[i].ID = gpus[i].ID
+        gpuLayers[i].DeviceID = gpus[i].DeviceID
    }
    var sum float32
@@ -658,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
//
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
// allowing for faster iteration, but may return less information.
-func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+//
+// Returns the list of GPU IDs that were used in the final allocation on success
+func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
    var success bool
    defer func() {
        if !success {
@@ -683,7 +685,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
        if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
            available = 0
        }
-        slog.Info("gpu memory", "id", gpu.ID,
+        slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
            "available", format.HumanBytes2(available),
            "free", format.HumanBytes2(gpu.FreeMemory),
            "minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -696,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
    gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
    if err != nil {
-        return err
+        return nil, err
    }
    if err := s.waitUntilRunnerLaunched(ctx); err != nil {
-        return err
+        return nil, err
    }
nextOperation:
@@ -710,7 +712,7 @@ nextOperation:
        s.loadRequest.GPULayers = gpuLayers
        resp, err := s.initModel(ctx, s.loadRequest, operation)
        if err != nil {
-            return err
+            return nil, err
        }
        resp.Memory.Log(slog.LevelDebug)
@@ -722,7 +724,7 @@ nextOperation:
        for {
            newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
            if err != nil {
-                return err
+                return nil, err
            }
            slog.Debug("new layout created", "layers", newGPULayers)
@@ -756,7 +758,7 @@ nextOperation:
                newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
                s.options.NumGPU = -1
                if err != nil {
-                    return err
+                    return nil, err
                }
                slog.Debug("new layout created", "layers", newGPULayers)
@@ -764,7 +766,7 @@ nextOperation:
                s.loadRequest.GPULayers = newGPULayers
                resp, err = s.initModel(ctx, s.loadRequest, operation)
                if err != nil {
-                    return err
+                    return nil, err
                }
                resp.Memory.Log(slog.LevelDebug)
@@ -773,7 +775,7 @@ nextOperation:
                if resp.Success {
                    verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
                    if err != nil {
-                        return err
+                        return nil, err
                    }
                    slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -798,7 +800,7 @@ nextOperation:
                }
                if s.options.NumGPU >= 0 {
-                    return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
+                    return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
                }
                // Memory allocation failed even though we created a layout that we thought should
@@ -808,7 +810,7 @@ nextOperation:
                // space.
                if backoff > 1 {
                    slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
-                    return errors.New("memory layout cannot be allocated")
+                    return nil, errors.New("memory layout cannot be allocated")
                } else if backoff == 0 {
                    backoff = 0.01
                } else {
@@ -823,7 +825,7 @@ nextOperation:
    s.loadRequest.GPULayers = gpuLayers
    resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
    if err != nil {
-        return err
+        return nil, err
    }
    success = resp.Success
@@ -831,10 +833,27 @@ nextOperation:
    if !success {
        slog.Warn("failed to commit memory for model", "memory", resp.Memory)
-        return errors.New("failed to commit memory for model")
+        return nil, errors.New("failed to commit memory for model")
    }
-    return nil
+    return uniqueDeviceIDs(gpuLayers), nil
+}
+
+func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
+    devices := []ml.DeviceID{}
+    for _, layer := range gpuLayers {
+        new := true
+        for _, ID := range devices {
+            if layer.DeviceID == ID {
+                new = false
+                break
+            }
+        }
+        if new {
+            devices = append(devices, layer.DeviceID)
+        }
+    }
+    return devices
}
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
...@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
for i := range gl { for i := range gl {
found := false found := false
for j := range memory.GPUs { for j := range memory.GPUs {
if gl[i].ID == memory.GPUs[j].ID { if gl[i].DeviceID == memory.GPUs[j].DeviceID {
if memory.GPUs[j].Graph != 0 { if memory.GPUs[j].Graph != 0 {
lastUsedGPU = i lastUsedGPU = i
} }
...@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
gl[i].FreeMemory = 0 gl[i].FreeMemory = 0
} }
slog.Debug("available gpu", "id", gl[i].ID, slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory), "available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory), "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()), "overhead", format.HumanBytes2(envconfig.GpuOverhead()),
...@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d ...@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
var vramSize uint64 var vramSize uint64
for _, gl := range gpuLayers { for _, gl := range gpuLayers {
for _, gpu := range memory.GPUs { for _, gpu := range memory.GPUs {
if gl.ID == gpu.ID { if gl.DeviceID == gpu.DeviceID {
vramSize += gpu.Graph vramSize += gpu.Graph
break break
} }
...@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int ...@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) { func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1 device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}} gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity) freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
for i := len(layers) - 1; i >= 0; i-- { for i := len(layers) - 1; i >= 0; i-- {
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers { if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
...@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req ...@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
if device < 0 { if device < 0 {
return gpuLayers return gpuLayers
} }
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...) gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity) freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
} }
} }
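A simplified, standalone illustration of the spill-over idea (not the real greedyFit, which also honors requestedLayers and works over discover.GpuInfoList): walk layers from the end of the model, fill the last GPU first, and move to the previous GPU once a layer no longer fits in its capacity-scaled free space. The sizes below are made up for the example.

```go
// Simplified sketch of greedy spill-over assignment; the numbers and the
// assignment type here are illustrative, not taken from the tests.
package main

import "fmt"

type assignment struct {
	gpu    int
	layers []int
}

func main() {
	free := []uint64{100, 300}         // MiB free on gpu0, gpu1
	layers := []uint64{80, 80, 80, 80} // per-layer sizes in MiB
	capacity := float32(1.0)           // findBestFit backs this off when a fit fails

	device := len(free) - 1
	space := uint64(float32(free[device]) * capacity)
	result := []assignment{{gpu: device}}

	for i := len(layers) - 1; i >= 0; i-- {
		for layers[i] > space {
			device--
			if device < 0 {
				fmt.Println(result)
				return
			}
			space = uint64(float32(free[device]) * capacity)
			result = append([]assignment{{gpu: device}}, result...)
		}
		result[0].layers = append(result[0].layers, i)
		space -= layers[i]
	}
	fmt.Println(result) // [{0 [0]} {1 [3 2 1]}]
}
```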
@@ -1312,6 +1331,17 @@ func (s *llmServer) Pid() int {
return -1
}
func (s *llmServer) GetPort() int {
return s.port
}
func (s *llmServer) HasExited() bool {
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
return true
}
return false
}
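A minimal sketch of why this exit check is cheap and non-blocking (hypothetical caller; assumes a Unix-like system with a sleep binary): os/exec only populates ProcessState after Wait has observed the child exiting, so a nil check plus ExitCode() >= 0 answers "has the runner gone away?" without blocking.

```go
// Hypothetical illustration of the HasExited pattern, not code from this change.
package main

import (
	"fmt"
	"os/exec"
	"time"
)

func hasExited(cmd *exec.Cmd) bool {
	// ProcessState is nil until Wait has reaped the child; ExitCode() is -1
	// while the process is still running or was terminated by a signal.
	return cmd != nil && cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0
}

func main() {
	cmd := exec.Command("sleep", "1")
	_ = cmd.Start()
	go cmd.Wait() // Wait populates ProcessState once the child exits

	for !hasExited(cmd) {
		time.Sleep(200 * time.Millisecond)
	}
	fmt.Println("runner exited with", cmd.ProcessState.ExitCode())
}
```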
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1386,7 +1416,7 @@ type CompletionResponse struct {
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
logutil.Trace("completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
@@ -1552,7 +1582,7 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -1704,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
@@ -1715,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
@@ -1757,16 +1792,28 @@ func (s *ollamaServer) TotalSize() uint64 {
return mem
}
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}
for _, g := range s.mem.GPUs {
if g.DeviceID == id {
return g.Size()
}
}
return 0
}
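As a hedged illustration of how these per-device totals compose (the helper below is hypothetical and not part of this change; it assumes ml.DeviceMemory carries a DeviceID, as the comparison above implies):

```go
// Hypothetical helper: total the VRAM attributed to a set of devices using
// the same per-device Size() accounting that VRAMByGPU uses for one GPU.
package schedutil // illustrative package name

import "github.com/ollama/ollama/ml"

func totalVRAM(mem *ml.BackendMemory, ids []ml.DeviceID) uint64 {
	if mem == nil {
		return 0
	}
	var total uint64
	for _, id := range ids {
		for _, g := range mem.GPUs {
			if g.DeviceID == id {
				total += g.Size()
				break
			}
		}
	}
	return total
}
```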
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log
slog.Debug("failure refreshing GPU information", "error", err)
}
// else no longer running so suppress logging as a failure is expected
}
return devices
}
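A hedged sketch of a caller that periodically refreshes device information through this method; the DeviceInfoProvider interface, the package name, and the polling cadence are assumptions for illustration, not part of this change.

```go
// Hypothetical consumer: poll a running server for fresh device info and log
// the free VRAM it reports.
package schedutil // illustrative package name

import (
	"context"
	"log/slog"
	"time"

	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/ml"
)

// DeviceInfoProvider is an assumed interface matching the method above.
type DeviceInfoProvider interface {
	GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
}

func watchVRAM(ctx context.Context, s DeviceInfoProvider) {
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			for _, d := range s.GetDeviceInfos(ctx) {
				slog.Debug("device", "library", d.Library, "id", d.ID,
					"free", format.HumanBytes2(d.FreeMemory))
			}
		}
	}
}
```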
@@ -16,8 +16,8 @@ import (
func TestLLMServerFitGPU(t *testing.T) {
type gpu struct {
id ml.DeviceID
free int
}
tests := []struct {
@@ -37,91 +37,91 @@ func TestLLMServerFitGPU(t *testing.T) {
},
{
name: "Full single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}},
},
{
name: "Partial single GPU",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Single GPU with numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Single GPU with numGPU 0",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 0,
expected: ml.GPULayersList{},
},
{
name: "Single GPU with numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}},
},
{
name: "Multi GPU fits on one",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}},
},
{
name: "Multi GPU split",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}},
},
{
name: "Multi GPU partial",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 1",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 2",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
numGPU: 2,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}},
},
{
name: "Multi GPU numGPU 999",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
numGPU: 999,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{2}}},
},
{
name: "Multi GPU different libraries",
gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}},
layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
numGPU: -1,
expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}},
},
{
name: "requireFull",
gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
numGPU: -1,
requireFull: true,
@@ -138,8 +138,7 @@ func TestLLMServerFitGPU(t *testing.T) {
gpus := make(discover.GpuInfoList, len(tt.gpus))
for i := range tt.gpus {
gpus[i].DeviceID = tt.gpus[i].id
gpus[i].Library = tt.gpus[i].library
gpus[i].FreeMemory = uint64(tt.gpus[i].free)
}
@@ -164,7 +163,7 @@ func TestLLMServerFitGPU(t *testing.T) {
}
for i := range s.mem.GPUs {
s.mem.GPUs[i].DeviceID = gpus[i].DeviceID
s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
}
...
@@ -5,14 +5,11 @@ import (
"context"
"encoding/binary"
"fmt"
"hash/maphash"
"log/slog"
"math" "math"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs" "github.com/ollama/ollama/fs"
) )
...@@ -29,6 +26,9 @@ type Backend interface { ...@@ -29,6 +26,9 @@ type Backend interface {
Get(name string) Tensor Get(name string) Tensor
NewContext() Context NewContext() Context
NewContextSize(size int) Context NewContextSize(size int) Context
// Enumerate the devices available for inference via this backend
BackendDevices() []DeviceInfo
}
// BackendCacheConfig should be implemented by backends that need special output
@@ -60,77 +60,6 @@ type CacheConfig struct {
MaskBatchPadding int
}
// GPULayers is a set of layers to be allocated on a single GPU
type GPULayers struct {
// ID is the identifier of the GPU, as reported in DeviceMemory
ID string
// Layers is a set of layer indices to load
Layers []int
}
func (g GPULayers) String() string {
if len(g.Layers) == 0 {
return ""
}
slices.Sort(g.Layers)
contiguous := true
base := g.Layers[0]
for i := range g.Layers {
if g.Layers[i] != base+i {
contiguous = false
break
}
}
if contiguous {
return fmt.Sprintf("ID:%v Layers:%v(%v..%v)", g.ID, len(g.Layers), g.Layers[0], g.Layers[len(g.Layers)-1])
} else {
return fmt.Sprintf("ID:%v Layers:%v%v", g.ID, len(g.Layers), g.Layers)
}
}
// GPULayersList is a set of layer allocations across multiple GPUs
type GPULayersList []GPULayers
func (l GPULayersList) String() string {
if l.Sum() > 0 {
return fmt.Sprintf("%v%v", l.Sum(), []GPULayers(l))
} else {
return fmt.Sprintf("%v", []GPULayers(l))
}
}
// Sum is the total number of layers assigned across all GPUs
func (l GPULayersList) Sum() int {
var sum int
for _, g := range l {
sum += len(g.Layers)
}
return sum
}
var h maphash.Hash
// Hash is an identifier of this layer assignment
func (l GPULayersList) Hash() uint64 {
h.Reset()
for _, g := range l {
if len(g.Layers) > 0 {
h.WriteString(g.ID)
for _, l := range g.Layers {
binary.Write(&h, binary.NativeEndian, int64(l))
}
}
}
return h.Sum64()
}
// BackendParams controls how the backend loads and executes models
type BackendParams struct {
// AllocMemory causes the backend to allocate memory for the model. If
@@ -148,150 +77,6 @@ type BackendParams struct {
FlashAttention bool
}
// ErrNoMem is returned when panicking due to insufficient memory. It includes
// the attempted memory allocation.
type ErrNoMem struct {
BackendMemory
}
func (e ErrNoMem) Error() string {
return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
}
// DeviceMemory provides a breakdown of the memory needed
// per device, such as a CPU or GPU.
type DeviceMemory struct {
// Name is the name of the device as labeled by the backend. It
// may not be persistent across instances of the runner.
Name string
// ID is an identifier for the device for matching with system
// management libraries.
ID string
// Weights is the per-layer memory needed for the model weights.
Weights []uint64
// Cache is the per-layer memory needed for the KV cache.
Cache []uint64
// Graph is the size of the compute graph. It is not per-layer.
Graph uint64
}
func sumMemory(mem []uint64) uint64 {
var sum uint64
for _, m := range mem {
sum += m
}
return sum
}
// Size returns the total size of the memory required by this device
func (m DeviceMemory) Size() uint64 {
return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph
}
func memoryPresent(mem []uint64) bool {
return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 })
}
func (m DeviceMemory) LogValue() slog.Value {
var attrs []slog.Attr
if memoryPresent(m.Weights) {
attrs = append(attrs, slog.Any("Weights", m.Weights))
}
if memoryPresent(m.Cache) {
attrs = append(attrs, slog.Any("Cache", m.Cache))
}
if m.Graph != 0 {
attrs = append(attrs, slog.Any("Graph", m.Graph))
}
if len(attrs) > 0 && m.ID != "" {
attrs = append([]slog.Attr{slog.String("ID", m.ID)}, attrs...)
}
return slog.GroupValue(attrs...)
}
// BackendMemory provides the amount of memory required to load the model
// per device based on the BackendParams. In some cases, not all required
// allocations will be known at this point. However, the size of the most recent
// allocation is guaranteed to be provided so that if it failed, the caller can
// accommodate that to make forward progress.
type BackendMemory struct {
// InputWeights are always located on the CPU and cannot be moved
InputWeights uint64
// CPU model components are located in system memory. This does not
// include unified memory allocated through the GPU.
CPU DeviceMemory
// GPU model components are located on one or more GPUs.
GPUs []DeviceMemory
}
func (m BackendMemory) LogValue() slog.Value {
var attrs []slog.Attr
if m.InputWeights != 0 {
attrs = append(attrs, slog.Any("InputWeights", m.InputWeights))
}
attrs = append(attrs, slog.Any(m.CPU.Name, m.CPU))
for _, g := range m.GPUs {
attrs = append(attrs, slog.Any(g.Name, g))
}
return slog.GroupValue(attrs...)
}
// Log prints a high level summary of the memory
func (m BackendMemory) Log(level slog.Level) {
var total uint64
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 {
slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := sumMemory(gpu.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := sumMemory(m.CPU.Cache); sum > 0 {
slog.Log(context.TODO(), level, "kv cache", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
for _, gpu := range m.GPUs {
if sum := gpu.Graph; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum))
total += sum
}
}
if sum := m.CPU.Graph; sum > 0 {
slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum))
total += sum
}
if total > 0 {
slog.Log(context.TODO(), level, "total memory", "size", format.HumanBytes2(total))
}
}
var backends = make(map[string]func(string, BackendParams) (Backend, error))
func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
...
package ggml
// #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
// #cgo windows LDFLAGS: -lpthread
// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
// #include <stdlib.h>
// #include <stdint.h>
@@ -168,6 +170,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
requiredMemory.CPU.ID = C.GoString(props.id)
requiredMemory.CPU.Library = C.GoString(props.library)
requiredMemory.CPU.Weights = make([]uint64, blocks+1)
requiredMemory.CPU.Cache = make([]uint64, blocks+1)
@@ -186,6 +189,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
var props C.struct_ggml_backend_dev_props
C.ggml_backend_dev_get_props(d, &props)
requiredMemory.GPUs[i].ID = C.GoString(props.id)
requiredMemory.GPUs[i].Library = C.GoString(props.library)
requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1)
requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1)
}
@@ -198,7 +202,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
for _, l := range p.Layers {
if l == layer {
for i := range requiredMemory.GPUs {
if requiredMemory.GPUs[i].DeviceID == p.DeviceID {
return gpuDeviceBufferTypes[i]
}
}
@@ -682,6 +686,52 @@ func (b *Backend) CacheConfig() ml.CacheConfig {
}
}
func (b *Backend) BackendDevices() []ml.DeviceInfo {
deviceInfos := []ml.DeviceInfo{}
for _, dev := range gpus {
// If we have a model loaded, and it's only loaded on a subset of the devices,
// skip idle/unused devices to avoid initializing them and causing VRAM allocations
if b.allocMemory {
idleDev := true
for _, backend := range b.schedBackends {
if dev == C.ggml_backend_get_device(backend) {
idleDev = false
break
}
}
if idleDev {
slog.Debug("skipping unused backend device", "description", C.GoString(C.ggml_backend_dev_description(dev)))
continue
}
}
info := ml.DeviceInfo{}
props := C.struct_ggml_backend_dev_props{}
C.ggml_backend_dev_get_props(dev, &props)
info.Name = C.GoString(props.name)
info.Description = C.GoString(props.description)
info.ID = C.GoString(props.id)
info.Library = C.GoString(props.library)
info.ComputeMajor = (int)(props.compute_major)
info.ComputeMinor = (int)(props.compute_minor)
info.DriverMajor = (int)(props.driver_major)
info.DriverMinor = (int)(props.driver_minor)
info.Integrated = props.integrated != 0
if props.library != nil {
info.Library = C.GoString(props.library)
}
info.PCIID = fmt.Sprintf("%02x:%02x.%x", props.pci_bus_id, props.pci_device_id, props.pci_domain_id)
info.LibraryPath = ggml.LibPaths()
C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total)
info.TotalMemory = (uint64)(props.memory_total)
info.FreeMemory = (uint64)(props.memory_free)
deviceInfos = append(deviceInfos, info)
}
return deviceInfos
}
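A hedged sketch of how a caller might narrow the devices reported here, for example by library and compute capability; the field names match the struct populated above, while the package name and the 7.5 cutoff are purely illustrative.

```go
// Hypothetical filter over the DeviceInfo slice returned by BackendDevices.
package discoverutil // illustrative package name

import "github.com/ollama/ollama/ml"

// supportedCUDA keeps CUDA devices at or above a minimum compute capability.
// The 7.5 threshold is an example value, not a policy from this change.
func supportedCUDA(devs []ml.DeviceInfo) []ml.DeviceInfo {
	out := make([]ml.DeviceInfo, 0, len(devs))
	for _, d := range devs {
		if d.Library != "CUDA" {
			continue
		}
		if d.ComputeMajor > 7 || (d.ComputeMajor == 7 && d.ComputeMinor >= 5) {
			out = append(out, d)
		}
	}
	return out
}
```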
type Context struct {
b *Backend
...
@@ -157,6 +157,15 @@ extern "C" {
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
int driver_major;
int driver_minor;
int compute_major;
int compute_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
const char *library;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
......
@@ -203,6 +203,8 @@ add_library(ggml-base
ggml-threading.h
ggml-quants.c
ggml-quants.h
mem_hip.cpp
mem_nvml.cpp
gguf.cpp)
target_include_directories(ggml-base PRIVATE .)
...
@@ -279,6 +279,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if defined(GGML_USE_HIP)
if (std::getenv("GGML_CUDA_INIT") != NULL) {
GGML_LOG_INFO("%s: initializing rocBLAS on device %d\n", __func__, id);
CUDA_CHECK(cudaSetDevice(id));
// rocblas_initialize will SIGABRT if the GPU isn't supported
rocblas_initialize();
GGML_LOG_INFO("%s: rocBLAS initialized on device %d\n", __func__, id);
}
#endif
#if defined(GGML_USE_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
@@ -332,9 +342,15 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
#ifdef __CUDA_ARCH_LIST__
if (std::getenv("GGML_CUDA_INIT") != NULL) {
GGML_ASSERT(ggml_cuda_has_arch(info.devices[id].cc) && "ggml was not compiled with support for this arch");
}
#endif // defined(__CUDA_ARCH_LIST__)
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
ggml_cuda_parse_uuid(prop, id).c_str()); ggml_cuda_parse_uuid(prop, id).c_str());
#endif // defined(GGML_USE_HIP) #endif // defined(GGML_USE_HIP)
} }
...@@ -3352,6 +3368,14 @@ struct ggml_backend_cuda_device_context { ...@@ -3352,6 +3368,14 @@ struct ggml_backend_cuda_device_context {
std::string name; std::string name;
std::string description; std::string description;
std::string id; std::string id;
int major;
int minor;
int driver_major;
int driver_minor;
int integrated;
int pci_bus_id;
int pci_device_id;
int pci_domain_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3372,6 +3396,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
ggml_cuda_set_device(ctx->device);
#if defined(GGML_USE_HIP)
if (ggml_hip_mgmt_init() == 0) {
int status = ggml_hip_get_device_memory(ctx->pci_bus_id, ctx->pci_device_id, free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing ADLX memory reporting free: %zu total: %zu\n", __func__, *free, *total);
ggml_hip_mgmt_release();
return;
}
ggml_hip_mgmt_release();
}
#else
if (ggml_nvml_init() == 0) {
int status = ggml_nvml_get_device_memory(ctx->id.c_str(), free, total);
if (status == 0) {
GGML_LOG_DEBUG("%s utilizing NVML memory reporting free: %zu total: %zu\n", __func__, *free, *total);
ggml_nvml_release();
return;
}
ggml_nvml_release();
}
#endif
CUDA_CHECK(cudaMemGetInfo(free, total));
}
@@ -3380,6 +3426,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
return GGML_BACKEND_DEVICE_TYPE_GPU;
}
#define GGML_HIP_NAME "HIP"
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
@@ -3390,6 +3437,23 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
props->memory_total = props->memory_free = 0;
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
#if defined(GGML_USE_HIP)
int cc = ggml_cuda_info().devices[ctx->device].cc - GGML_CUDA_CC_OFFSET_AMD;
props->compute_major = cc / 0x100;
props->compute_minor = cc - (props->compute_major * 0x100);
#else
props->compute_major = ctx->major;
props->compute_minor = ctx->minor;
#endif
props->driver_major = ctx->driver_major;
props->driver_minor = ctx->driver_minor;
props->integrated = ctx->integrated;
props->pci_bus_id = ctx->pci_bus_id;
props->pci_device_id = ctx->pci_device_id;
props->pci_domain_id = ctx->pci_domain_id;
props->library = GGML_CUDA_NAME;
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
#ifdef GGML_CUDA_NO_PEER_COPY
bool events = false;
@@ -3980,6 +4044,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
int driverVersion = 0;
CUDA_CHECK(cudaDriverGetVersion(&driverVersion));
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -3990,7 +4056,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
dev_ctx->major = prop.major;
dev_ctx->minor = prop.minor;
dev_ctx->driver_major = driverVersion / 1000;
dev_ctx->driver_minor = (driverVersion - (dev_ctx->driver_major * 1000)) / 10;
dev_ctx->integrated = prop.integrated;
dev_ctx->pci_bus_id = prop.pciBusID;
dev_ctx->pci_device_id = prop.pciDeviceID;
dev_ctx->pci_domain_id = prop.pciDomainID;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,
...