"internal/orderedmap/orderedmap_test.go" did not exist on "d087e46bd193b1101cef13e28841185a465a077f"
Commit 0ce8bcfd authored by xuxzh1's avatar xuxzh1 🎱
Browse files

init

parent b0135f4b
......@@ -140,7 +140,8 @@ typedef struct cudart_init_resp {
} cudart_init_resp_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_check_vram(cudart_handle_t ch, int device_id, mem_info_t *resp);
void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
// TODO - if we keep this library longer term, add cudart_get_free
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__
......
......@@ -2,3 +2,4 @@
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();
uint64_t getFreeMemory();
//go:build darwin
#import <Foundation/Foundation.h>
#import <mach/mach.h>
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM() {
......@@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() {
return result;
}
// getPhysicalMemory returns the total physical memory in bytes
uint64_t getPhysicalMemory() {
return [[NSProcessInfo processInfo] physicalMemory];
return [NSProcessInfo processInfo].physicalMemory;
}
// getFreeMemory returns the total free memory in bytes, including inactive
// memory that can be reclaimed by the system.
uint64_t getFreeMemory() {
mach_port_t host_port = mach_host_self();
mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t);
vm_size_t pagesize;
vm_statistics64_data_t vm_stat;
host_page_size(host_port, &pagesize);
if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) {
return 0;
}
uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize;
free_memory += (uint64_t)vm_stat.speculative_count * pagesize;
free_memory += (uint64_t)vm_stat.inactive_count * pagesize;
return free_memory;
}
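For context, a minimal sketch (not part of this commit) of how the Go side could surface these Darwin helpers through cgo, assuming the .m/.h files above sit in the same Go package:
package gpu
/*
#cgo darwin LDFLAGS: -framework Foundation
#include "gpu_info_darwin.h"
*/
import "C"
// darwinMemStats returns total physical memory and reclaimable free memory
// in bytes by delegating to the Objective-C helpers declared above.
func darwinMemStats() (total, free uint64) {
	return uint64(C.getPhysicalMemory()), uint64(C.getFreeMemory())
}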
......@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
resp->cudaErr = CUDA_SUCCESS;
const int buflen = 256;
char buf[buflen + 1];
int i;
......@@ -38,12 +39,13 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*l[i].p) {
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
......@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
}
......@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
......@@ -91,12 +91,13 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
}
const int buflen = 256;
void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
......@@ -106,13 +107,13 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("nvcuda handle isn't initialized");
resp->err = strdup("cuda driver library handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device failed to initialize");
snprintf(buf, buflen, "cuda driver library device failed to initialize");
resp->err = strdup(buf);
return;
}
......@@ -168,14 +169,14 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
......@@ -193,12 +194,47 @@ void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "nvcuda failed to release primary device context %d", ret);
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
CUresult ret;
CUcontext ctx = NULL;
CUdevice device = -1;
*free = 0;
*total = 0;
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device failed to initialize");
return;
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to get device context %d", ret);
return;
}
ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device memory info lookup failure %d", ret);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing nvcuda library\n");
LOG(h.verbose, "releasing cuda driver library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;
......
......@@ -7,9 +7,12 @@
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_MEMORY_ALLOCATION = 2,
CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
CUDA_ERROR_UNKNOWN = 999,
// Other values omitted for now...
} CUresult;
......@@ -64,10 +67,12 @@ typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
CUresult cudaErr;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
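Since the driver-too-old branch was removed from nvcuda_init above, the caller now only sees the raw CUresult in cudaErr. A hedged sketch of how a Go caller might translate it; the helper name is hypothetical, and 35 is CUDA_ERROR_INSUFFICIENT_DRIVER from the enum above:
package gpu
import "fmt"
const cudaErrorInsufficientDriver = 35 // CUDA_ERROR_INSUFFICIENT_DRIVER
// describeCudaInitErr maps the CUresult reported in nvcuda_init_resp_t.cudaErr
// to a user-facing error, mirroring the message dropped from the C code.
func describeCudaInitErr(cudaErr int, rawErr string) error {
	if cudaErr == cudaErrorInsufficientDriver {
		return fmt.Errorf("your nvidia driver is too old or missing, please upgrade to run ollama")
	}
	return fmt.Errorf("cuda driver library init failure: %s", rawErr)
}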
......
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvml_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
}
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret);
*free = 0;
return;
}
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
*free = 0;
return;
}
*free = memInfo.free;
*total = memInfo.total;
*used = memInfo.used;
}
void nvml_release(nvml_handle_t h) {
LOG(h.verbose, "releasing nvml library\n");
nvmlReturn_t ret;
ret = (*h.nvmlShutdown)();
if (ret != NVML_SUCCESS) {
LOG(1, "error during nvmlShutdown %d", ret);
}
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__
\ No newline at end of file
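The management library path above reports free/total/used independently of the driver library, and the CudaGPUInfo.OSOverhead field added later in this commit records the gap between the two. One plausible way to compute it (illustrative only, not necessarily the exact calculation used elsewhere in the codebase):
package gpu
// computeOSOverhead estimates memory held by the OS or other processes beyond
// what the CUDA driver library can see: NVML used minus driver-library used,
// clamped at zero so a noisy reading never underflows.
func computeOSOverhead(nvmlTotal, nvmlFree, drvTotal, drvFree uint64) uint64 {
	nvmlUsed := nvmlTotal - nvmlFree
	drvUsed := drvTotal - drvFree
	if nvmlUsed > drvUsed {
		return nvmlUsed - drvUsed
	}
	return 0
}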
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedefs to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__
\ No newline at end of file
......@@ -4,15 +4,17 @@
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
{
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup
{
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
......@@ -28,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle)
{
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
......@@ -44,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++)
{
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!l[i].p)
{
if (!*(l[i].p)) {
resp->oh.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
......@@ -63,23 +62,70 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS)
{
LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
(*resp->oh.zesDriverGet)(&resp->num_devices, NULL);
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
{
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
......@@ -88,127 +134,126 @@ void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL)
{
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
uint32_t driversCount = 0;
ret = (*h.zesDriverGet)(&driversCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get driver count: %d", ret);
resp->err = strdup(buf);
if (driver > h.num_drivers || device > h.num_devices[driver]) {
resp->err = strdup("driver of device index out of bounds");
return;
}
LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
zes_driver_handle_t *allDrivers =
malloc(driversCount * sizeof(zes_driver_handle_t));
(*h.zesDriverGet)(&driversCount, allDrivers);
resp->total = 0;
resp->free = 0;
for (d = 0; d < driversCount; d++)
{
uint32_t deviceCount = 0;
ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device count: %d", ret);
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(allDrivers);
free(mems);
return;
}
LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
zes_device_handle_t *devices =
malloc(deviceCount * sizeof(zes_device_handle_t));
(*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
for (i = 0; i < deviceCount; i++)
{
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(devices[i], &props);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
if (h.verbose)
{
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
props.modelName);
LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
props.brandName);
LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
props.vendorName);
LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
props.serialNumber);
LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
props.boardNumber);
}
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen,
"unable to enumerate Level-Zero memory modules: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
for (m = 0; m < memCount; m++)
{
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS)
{
snprintf(buf, buflen, "unable to get memory state: %d", ret);
resp->err = strdup(buf);
free(allDrivers);
free(devices);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
free(mems);
}
free(devices);
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
free(allDrivers);
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver > h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
#endif // __APPLE__
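With per-driver device counts now tracked in the handle, the Go caller can walk every (driver, device) pair. A hedged sketch of that loop; the two function parameters stand in for the cgo calls to oneapi_get_device_count and oneapi_check_vram and are not the project's actual API:
package gpu
import "fmt"
// enumerateOneapiGPUs visits each (driver, device) pair exposed by the
// Level-Zero wrapper and prints its memory, using caller-supplied shims
// in place of the real cgo calls.
func enumerateOneapiGPUs(numDrivers int,
	deviceCount func(driver int) int,
	checkVram func(driver, device int) (total, free uint64)) {
	for d := 0; d < numDrivers; d++ {
		for i := 0; i < deviceCount(d); i++ {
			total, free := checkVram(d, i)
			fmt.Printf("oneapi [%d:%d] total=%d free=%d\n", d, i, total, free)
		}
	}
}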
......@@ -9,8 +9,7 @@
#define ZE_BIT(_i) (1 << _i)
// Just enough typedefs to dlopen/dlsym for memory information
typedef enum ze_result_t
{
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
......@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t
{
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t
{
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
......@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t
{
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t
{
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t
{
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t
{
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t
{
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t
{
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
......@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t
{
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
......@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t
{
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
......@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t
{
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
......@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t
{
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
......@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t
{
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
......@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t
{
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
......@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t
{
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
......@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t
{
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
......@@ -171,10 +154,19 @@ typedef struct _zes_mem_state_t
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle
{
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
......@@ -191,21 +183,21 @@ typedef struct oneapi_handle
} oneapi_handle_t;
typedef struct oneapi_init_resp
{
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
int num_devices;
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp
{
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_INTEL_H__
#endif // __APPLE__
package gpu
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/ollama/ollama/format"
)
var CudartGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var NvmlGlobs = []string{}
var NvcudaGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var OneapiGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
var (
CudartMgmtName = "libcudart.so*"
NvcudaMgmtName = "libcuda.so*"
NvmlMgmtName = "" // not currently wired on linux
OneapiMgmtName = "libze_intel_gpu.so"
)
func GetCPUMem() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached, freeSwap uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
switch {
case strings.HasPrefix(line, "MemTotal:"):
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
case strings.HasPrefix(line, "MemAvailable:"):
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
case strings.HasPrefix(line, "MemFree:"):
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
case strings.HasPrefix(line, "Buffers:"):
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
case strings.HasPrefix(line, "Cached:"):
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
case strings.HasPrefix(line, "SwapFree:"):
_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
default:
continue
}
if err != nil {
return mem, err
}
}
mem.TotalMemory = total * format.KibiByte
mem.FreeSwap = freeSwap * format.KibiByte
if available > 0 {
mem.FreeMemory = available * format.KibiByte
} else {
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
}
return mem, nil
}
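The Sscanf formats above work because /proc/meminfo reports kibibytes and the %d verb skips the padding between the label and the number, stopping before the trailing "kB". A quick standalone check with an invented sample value:
package main
import "fmt"
func main() {
	var total uint64
	// %d discards leading spaces and stops at the " kB" suffix.
	fmt.Sscanf("MemTotal:       32701464 kB", "MemTotal:%d", &total)
	fmt.Println(total * 1024) // kibibytes to bytes, as format.KibiByte does above
}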
......@@ -5,11 +5,12 @@ import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Greater(t, len(info), 0)
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
......@@ -19,7 +20,7 @@ func TestBasicGetGPUInfo(t *testing.T) {
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
assert.NoError(t, err)
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
......
package gpu
import (
"fmt"
"syscall"
"unsafe"
)
type MEMORYSTATUSEX struct {
length uint32
MemoryLoad uint32
TotalPhys uint64
AvailPhys uint64
TotalPageFile uint64
AvailPageFile uint64
TotalVirtual uint64
AvailVirtual uint64
AvailExtendedVirtual uint64
}
var (
k32 = syscall.NewLazyDLL("kernel32.dll")
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
)
var CudartGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvmlGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var NvcudaGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var (
CudartMgmtName = "cudart64_*.dll"
NvcudaMgmtName = "nvcuda.dll"
NvmlMgmtName = "nvml.dll"
OneapiMgmtName = "ze_intel_gpu64.dll"
)
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
if r1 == 0 {
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
}
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
}
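A small caller sketch (not from this commit) showing which memInfo fields the Windows and Linux GetCPUMem variants populate:
package gpu
import "log/slog"
// logCPUMem fetches host memory via the platform-specific GetCPUMem and logs
// the fields this commit wires up: TotalMemory, FreeMemory, and FreeSwap.
func logCPUMem() {
	mem, err := GetCPUMem()
	if err != nil {
		slog.Warn("cpu memory lookup failed", "error", err)
		return
	}
	slog.Info("cpu memory", "total", mem.TotalMemory, "free", mem.FreeMemory, "free_swap", mem.FreeSwap)
}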
......@@ -10,6 +10,7 @@ import (
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
FreeSwap uint64 `json:"free_swap,omitempty"`
}
// Beginning of an `ollama info` command
......@@ -18,7 +19,7 @@ type GpuInfo struct {
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
Variant CPUCapability `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
......@@ -26,6 +27,14 @@ type GpuInfo struct {
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage.
// False indicates FreeMemory can generally be trusted on this GPU.
UnreliableFreeMemory bool
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
......@@ -38,6 +47,31 @@ type GpuInfo struct {
// TODO other performance capability info to help in scheduling decisions
}
type CPUInfo struct {
GpuInfo
}
type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
type RocmGPUInfo struct {
GpuInfo
usedFilepath string //nolint:unused,nolintlint
index int //nolint:unused,nolintlint
}
type RocmGPUInfoList []RocmGPUInfo
type OneapiGPUInfo struct {
GpuInfo
driverIndex int //nolint:unused,nolintlint
gpuIndex int //nolint:unused,nolintlint
}
type OneapiGPUInfoList []OneapiGPUInfo
type GpuInfoList []GpuInfo
// Split up the set of gpu infos by Library and variant
......@@ -47,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
for _, info := range l {
found := false
requested := info.Library
if info.Variant != "" {
requested += "_" + info.Variant
if info.Variant != CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
for i, lib := range libs {
if lib == requested {
......@@ -86,3 +120,26 @@ type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type CPUCapability uint32
// Override at build time when building base GPU runners
var GPURunnerCPUCapability = CPUCapabilityAVX
const (
CPUCapabilityNone CPUCapability = iota
CPUCapabilityAVX
CPUCapabilityAVX2
// TODO AVX512
)
func (c CPUCapability) String() string {
switch c {
case CPUCapabilityAVX:
return "avx"
case CPUCapabilityAVX2:
return "avx2"
default:
return "no vector extensions"
}
}
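Two small usage sketches for the helpers in this file, with invented values: sorting GPUs by free memory, and building the same library+variant key that ByLibrary uses above:
package gpu
import "sort"
// roomiestLast sorts ascending by FreeMemory, so the GPU with the most free
// memory ends up at the end of the slice.
func roomiestLast(gpus []GpuInfo) {
	sort.Sort(ByFreeMemory(gpus))
}
// libraryKey reproduces the grouping key used by ByLibrary: the library name,
// suffixed with the CPU capability variant when one is set.
func libraryKey(info GpuInfo) string {
	key := info.Library
	if info.Variant != CPUCapabilityNone {
		key += "_" + info.Variant.String() // e.g. "cpu_avx2"
	}
	return key
}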
......@@ -45,14 +45,7 @@ func TestUnicodeModelDir(t *testing.T) {
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
oldModelsDir := os.Getenv("OLLAMA_MODELS")
if oldModelsDir == "" {
defer os.Unsetenv("OLLAMA_MODELS")
} else {
defer os.Setenv("OLLAMA_MODELS", oldModelsDir)
}
err = os.Setenv("OLLAMA_MODELS", modelDir)
require.NoError(t, err)
t.Setenv("OLLAMA_MODELS", modelDir)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
......
......@@ -11,25 +11,29 @@ import (
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
)
func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
......@@ -37,43 +41,64 @@ func TestMultiModelConcurrency(t *testing.T) {
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
GenerateTestHelper(ctx, t, req[i], resp[i])
DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if maxVram < 4*format.GibiByte {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
req, resp := GenerateRequests()
// Get the server running (if applicable) and warm the model up with a single initial request
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
wg.Add(len(req))
for i := 0; i < len(req); i++ {
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < 5; j++ {
for j := 0; j < iterLimit; j++ {
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the 4 concurrent requests
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
......@@ -82,13 +107,16 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
vram := os.Getenv("OLLAMA_MAX_VRAM")
if vram == "" {
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
max, err := strconv.ParseUint(vram, 10, 64)
require.NoError(t, err)
const MB = uint64(1024 * 1024)
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
......@@ -97,83 +125,82 @@ func TestMultiModelStress(t *testing.T) {
smallModels := []model{
{
name: "orca-mini",
size: 2992 * MB,
size: 2992 * format.MebiByte,
},
{
name: "phi",
size: 2616 * MB,
size: 2616 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * MB,
size: 2364 * format.MebiByte,
},
{
name: "stable-code:3b",
size: 2608 * MB,
size: 2608 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * MB,
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "llama2",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "orca-mini:7b",
size: 5118 * MB,
size: 5118 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * MB,
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * MB,
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
// TODO - uncomment this once #3565 is merged and this is rebased on it
// {
// name: "codellama:7b",
// size: 5118 * MB,
// },
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * MB,
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * MB,
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * MB,
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case max < 10000*MB:
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case max < 30000*MB:
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
......@@ -202,15 +229,15 @@ func TestMultiModelStress(t *testing.T) {
}
var wg sync.WaitGroup
consumed := uint64(256 * MB) // Assume some baseline usage
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
if i > 1 && consumed > max {
slog.Info("achieved target vram exhaustion", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
if i > 1 && consumed > maxVram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vramMB", max/1024/1024, "modelsMB", consumed/1024/1024)
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {
......@@ -221,5 +248,23 @@ func TestMultiModelStress(t *testing.T) {
}
}(i)
}
go func() {
for {
time.Sleep(2 * time.Second)
select {
case <-ctx.Done():
return
default:
models, err := client.ListRunning(ctx)
if err != nil {
slog.Warn("failed to list running models", "error", err)
continue
}
for _, m := range models.Models {
slog.Info("loaded model snapshot", "model", m)
}
}
}
}()
wg.Wait()
}
......@@ -11,7 +11,8 @@ import (
)
func TestContextExhaustion(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
// Longer needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
......@@ -24,5 +25,10 @@ func TestContextExhaustion(t *testing.T) {
"num_ctx": 128,
},
}
GenerateTestHelper(ctx, t, req, []string{"once", "upon", "lived"})
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
}
//go:build integration
package integration
import (
"context"
"math"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func floatsEqual32(a, b float32) bool {
return math.Abs(float64(a-b)) <= 1e-4
}
func floatsEqual64(a, b float64) bool {
return math.Abs(a-b) <= 1e-4
}
func TestAllMiniLMEmbeddings(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
}
res, err := embeddingTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embedding) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
}
if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
}
}
func TestAllMiniLMEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 1 {
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
}
if res.PromptEvalCount != 8 {
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
}
}
func TestAllMiniLMBatchEmbed(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.EmbedRequest{
Model: "all-minilm",
Input: []string{"why is the sky blue?", "why is the grass green?"},
}
res, err := embedTestHelper(ctx, t, req)
if err != nil {
t.Fatalf("error: %v", err)
}
if len(res.Embeddings) != 2 {
t.Fatalf("expected 2 embeddings, got %d", len(res.Embeddings))
}
if len(res.Embeddings[0]) != 384 {
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
}
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
}
if res.PromptEvalCount != 16 {
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
}
}
func TestAllMiniLMEmbedTruncate(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
truncTrue, truncFalse := true, false
type testReq struct {
Name string
Request api.EmbedRequest
}
reqs := []testReq{
{
Name: "Target Truncation",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why",
},
},
{
Name: "Default Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Options: map[string]any{"num_ctx": 1},
},
},
{
Name: "Explicit Truncate",
Request: api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncTrue,
Options: map[string]any{"num_ctx": 1},
},
},
}
res := make(map[string]*api.EmbedResponse)
for _, req := range reqs {
response, err := embedTestHelper(ctx, t, req.Request)
if err != nil {
t.Fatalf("error: %v", err)
}
res[req.Name] = response
}
if res["Target Truncation"].Embeddings[0][0] != res["Default Truncate"].Embeddings[0][0] {
t.Fatal("expected default request to truncate correctly")
}
if res["Default Truncate"].Embeddings[0][0] != res["Explicit Truncate"].Embeddings[0][0] {
t.Fatal("expected default request and truncate true request to be the same")
}
// check that truncate set to false returns an error if context length is exceeded
_, err := embedTestHelper(ctx, t, api.EmbedRequest{
Model: "all-minilm",
Input: "why is the sky blue?",
Truncate: &truncFalse,
Options: map[string]any{"num_ctx": 1},
})
if err == nil {
t.Fatal("expected error, got nil")
}
}
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
response, err := client.Embeddings(ctx, &req)
if err != nil {
return nil, err
}
return response, nil
}
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("failed to pull model %s: %v", req.Model, err)
}
response, err := client.Embed(ctx, &req)
if err != nil {
return nil, err
}
return response, nil
}
......@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
resp := "the ollam"
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
defer cancel()
GenerateTestHelper(ctx, t, req, []string{resp})
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
// llava models on CPU can be quite slow to start, so allow generous timeouts below
DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
}
const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
......
......@@ -35,8 +35,8 @@ var (
},
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims"},
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims"},
}
)
......
......@@ -5,7 +5,6 @@ package integration
import (
"context"
"errors"
"fmt"
"log/slog"
"os"
"strconv"
......@@ -14,8 +13,10 @@ import (
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
)
func TestMaxQueue(t *testing.T) {
......@@ -27,13 +28,10 @@ func TestMaxQueue(t *testing.T) {
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless you're on a GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
mq := os.Getenv("OLLAMA_MAX_QUEUE")
if mq != "" {
var err error
threadCount, err = strconv.Atoi(mq)
require.NoError(t, err)
if maxQueue := envconfig.MaxQueue(); maxQueue != 0 {
threadCount = int(maxQueue)
} else {
os.Setenv("OLLAMA_MAX_QUEUE", fmt.Sprintf("%d", threadCount))
t.Setenv("OLLAMA_MAX_QUEUE", strconv.Itoa(threadCount))
}
req := api.GenerateRequest{
......