Commit 95005046 authored by wangkx1

add all ollama

parent 22cb4ffc
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#endif
#define LOG(verbose, ...) \
do { \
if (verbose) { \
fprintf(stderr, __VA_ARGS__); \
} \
} while (0)
#ifdef __cplusplus
extern "C" {
#endif
#define GPU_ID_LEN 64
#define GPU_NAME_LEN 96
typedef struct mem_info {
char *err; // If non-null, caller responsible for freeing
char gpu_id[GPU_ID_LEN];
char gpu_name[GPU_NAME_LEN];
uint64_t total;
uint64_t free;
uint64_t used;
// Compute Capability
int major;
int minor;
int patch;
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cudart.h"
#include "gpu_info_nvcuda.h"
#include "gpu_info_nvml.h"
#include "gpu_info_oneapi.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_cudart.h"
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
cudartReturn_t ret;
resp->err = NULL;
resp->num_devices = 0;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
{"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
{"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
{"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
{"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cudart_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.cudaSetDevice)(0);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
if (ret == CUDART_ERROR_INSUFFICIENT_DRIVER) {
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
return;
}
snprintf(buf, buflen, "cudart init failure: %d", ret);
resp->err = strdup(buf);
return;
}
int version = 0;
cudartDriverVersion_t driverVersion;
driverVersion.major = 0;
driverVersion.minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cudaDriverGetVersion)(&version);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
} else {
driverVersion.major = version / 1000;
driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
}
ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
if (ret != CUDART_SUCCESS) {
LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
return;
}
}
void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
cudartMemory_t memInfo = {0,0,0};
cudartReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->err = strdup("cudart handle isn't initialized");
return;
}
ret = (*h.cudaSetDevice)(i);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device failed to initialize");
resp->err = strdup(buf);
return;
}
cudaDeviceProp_t props;
ret = (*h.cudaGetDeviceProperties)(&props, i);
if (ret != CUDART_SUCCESS) {
LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
resp->major = 0;
resp->minor = 0;
} else {
int allNull = 1;
for (int j = 0; j < 16; j++) {
if (props.uuid.bytes[j] != 0) {
allNull = 0;
break;
}
}
if (allNull != 0) {
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
} else {
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
props.uuid.bytes[0],
props.uuid.bytes[1],
props.uuid.bytes[2],
props.uuid.bytes[3],
props.uuid.bytes[4],
props.uuid.bytes[5],
props.uuid.bytes[6],
props.uuid.bytes[7],
props.uuid.bytes[8],
props.uuid.bytes[9],
props.uuid.bytes[10],
props.uuid.bytes[11],
props.uuid.bytes[12],
props.uuid.bytes[13],
props.uuid.bytes[14],
props.uuid.bytes[15]
);
}
resp->major = props.major;
resp->minor = props.minor;
// TODO add other useful properties from props
}
ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
if (ret != CUDART_SUCCESS) {
snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
resp->used = memInfo.used;
LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}
void cudart_release(cudart_handle_t h) {
LOG(h.verbose, "releasing cudart library\n");
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
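// Illustrative sketch of the intended lifecycle (hypothetical caller; the
// library path below is an assumption, real callers glob known locations):
// cudart_init, then cudart_bootstrap per device, then cudart_release.
static void example_cudart_usage(void) {
  cudart_init_resp_t resp = {0};
  resp.ch.verbose = 1;
  cudart_init("/usr/local/cuda/lib64/libcudart.so", &resp);
  if (resp.err != NULL) {
    // Per the init contract, a non-null err is heap allocated and owned by
    // the caller
    fprintf(stderr, "cudart unavailable: %s\n", resp.err);
    free(resp.err);
    return;
  }
  for (int dev = 0; dev < resp.num_devices; dev++) {
    mem_info_t mem = {0};
    cudart_bootstrap(resp.ch, dev, &mem);
    if (mem.err != NULL) {
      fprintf(stderr, "device %d lookup failed: %s\n", dev, mem.err);
      free(mem.err);
      continue;
    }
    fprintf(stderr, "[%s] free %llu of %llu bytes\n", mem.gpu_id,
            (unsigned long long)mem.free, (unsigned long long)mem.total);
  }
  cudart_release(resp.ch);
}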
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDART_H__
#define __GPU_INFO_CUDART_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudartReturn_enum {
CUDART_SUCCESS = 0,
CUDART_ERROR_INVALID_VALUE = 1,
CUDART_ERROR_MEMORY_ALLOCATION = 2,
CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
// Other values omitted for now...
} cudartReturn_t;
typedef enum cudartDeviceAttr_enum {
cudartDevAttrComputeCapabilityMajor = 75,
cudartDevAttrComputeCapabilityMinor = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
cudaDevAttrIntegrated = 18
} cudartDeviceAttr_t;
typedef void *cudartDevice_t; // Opaque is sufficient
typedef struct cudartMemory_st {
size_t total;
size_t free;
size_t used;
} cudartMemory_t;
typedef struct cudartDriverVersion {
int major;
int minor;
} cudartDriverVersion_t;
typedef struct cudaUUID {
unsigned char bytes[16];
} cudaUUID_t;
typedef struct cudaDeviceProp {
char name[256]; /**< ASCII string identifying device */
cudaUUID_t uuid; /**< 16-byte unique identifier */
char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
size_t totalGlobalMem; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
int regsPerBlock; /**< 32-bit registers available per block */
int warpSize; /**< Warp size in threads */
size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock; /**< Maximum number of threads per block */
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
int clockRate; /**< Clock frequency in kilohertz */
size_t totalConstMem; /**< Constant memory available on device in bytes */
int major; /**< Major compute capability */
int minor; /**< Minor compute capability */
size_t textureAlignment; /**< Alignment requirement for textures */
size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
int multiProcessorCount; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */
int integrated; /**< Device is integrated as opposed to discrete */
int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode; /**< Compute mode (See ::cudaComputeMode) */
int maxTexture1D; /**< Maximum 1D texture size */
int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear; /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */
int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
int maxSurface1D; /**< Maximum 1D surface size */
int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment; /**< Alignment requirements for surfaces */
int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled; /**< Device has ECC support enabled */
int pciBusID; /**< PCI bus ID of the device */
int pciDeviceID; /**< PCI device ID of the device */
int pciDomainID; /**< PCI domain ID of the device */
int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount; /**< Number of asynchronous engines */
int unifiedAddressing; /**< Device shares a unified address space with the host */
int memoryClockRate; /**< Peak memory clock frequency in kilohertz */
int memoryBusWidth; /**< Global memory bus width in bits */
int l2CacheSize; /**< Size of L2 cache in bytes */
int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */
int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
int streamPrioritiesSupported; /**< Device supports stream priorities */
int globalL1CacheSupported; /**< Device supports caching globals in L1 */
int localL1CacheSupported; /**< Device supports caching locals in L1 */
size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */
int managedMemory; /**< Device supports allocating managed memory on this system */
int isMultiGpuBoard; /**< Device is on a multi-GPU board */
int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */
int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */
int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */
int computePreemptionSupported; /**< Device supports Compute Preemption */
int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */
int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */
} cudaDeviceProp_t;
typedef struct cudart_handle {
void *handle;
uint16_t verbose;
cudartReturn_t (*cudaSetDevice)(int device);
cudartReturn_t (*cudaDeviceSynchronize)(void);
cudartReturn_t (*cudaDeviceReset)(void);
cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
cudartReturn_t (*cudaGetDeviceCount)(int *);
cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
cudartReturn_t (*cudaGetDeviceProperties) (cudaDeviceProp_t* prop, int device);
} cudart_handle_t;
typedef struct cudart_init_resp {
char *err; // If err is non-null handle is invalid
cudart_handle_t ch;
int num_devices;
} cudart_init_resp_t;
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
void cudart_bootstrap(cudart_handle_t ch, int device_id, mem_info_t *resp);
// TODO - if we keep this library longer term, add cudart_get_free
void cudart_release(cudart_handle_t ch);
#endif // __GPU_INFO_CUDART_H__
#endif // __APPLE__
#import <Metal/Metal.h>
#include <stdint.h>
uint64_t getRecommendedMaxVRAM();
uint64_t getPhysicalMemory();
uint64_t getFreeMemory();
#import <Foundation/Foundation.h>
#import <mach/mach.h>
#include "gpu_info_darwin.h"
uint64_t getRecommendedMaxVRAM() {
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
uint64_t result = device.recommendedMaxWorkingSetSize;
CFRelease(device);
return result;
}
// getPhysicalMemory returns the total physical memory in bytes
uint64_t getPhysicalMemory() {
return [NSProcessInfo processInfo].physicalMemory;
}
// getFreeMemory returns the total free memory in bytes, including inactive
// memory that can be reclaimed by the system.
uint64_t getFreeMemory() {
mach_port_t host_port = mach_host_self();
mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t);
vm_size_t pagesize;
vm_statistics64_data_t vm_stat;
host_page_size(host_port, &pagesize);
if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) {
return 0;
}
uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize;
free_memory += (uint64_t)vm_stat.speculative_count * pagesize;
free_memory += (uint64_t)vm_stat.inactive_count * pagesize;
return free_memory;
}
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvcuda.h"
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
CUresult ret;
resp->err = NULL;
resp->num_devices = 0;
resp->cudaErr = CUDA_SUCCESS;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"cuInit", (void *)&resp->ch.cuInit},
{"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
{"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
{"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
{"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
{"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
{"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
{"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
{"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
{"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvcuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
for (i = 0; l[i].s != NULL; i++) {
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
resp->cudaErr = -1;
return;
}
}
ret = (*resp->ch.cuInit)(0);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
int version = 0;
resp->ch.driver_major = 0;
resp->ch.driver_minor = 0;
// Report driver version if we're in verbose mode, ignore errors
ret = (*resp->ch.cuDriverGetVersion)(&version);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
} else {
resp->ch.driver_major = version / 1000;
resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
}
ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
if (ret != CUDA_SUCCESS) {
LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
resp->cudaErr = ret;
return;
}
}
const int buflen = 256;
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
resp->err = NULL;
nvcudaMemory_t memInfo = {0,0};
CUresult ret;
CUdevice device = -1;
CUcontext ctx = NULL;
char buf[buflen + 1];
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (h.handle == NULL) {
resp->err = strdup("cuda driver library handle isn't initialized");
return;
}
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "cuda driver library device failed to initialize");
resp->err = strdup(buf);
return;
}
int major = 0;
int minor = 0;
ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
} else {
ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
} else {
resp->minor = minor;
resp->major = major;
}
}
ret = (*h.cuDeviceGetUuid)(&uuid, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
} else {
// GPU-d110a105-ac29-1d54-7b49-9c90440f215b
snprintf(&resp->gpu_id[0], GPU_ID_LEN,
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid.bytes[0],
uuid.bytes[1],
uuid.bytes[2],
uuid.bytes[3],
uuid.bytes[4],
uuid.bytes[5],
uuid.bytes[6],
uuid.bytes[7],
uuid.bytes[8],
uuid.bytes[9],
uuid.bytes[10],
uuid.bytes[11],
uuid.bytes[12],
uuid.bytes[13],
uuid.bytes[14],
uuid.bytes[15]
);
}
ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
if (ret != CUDA_SUCCESS) {
LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
resp->gpu_name[0] = '\0';
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
if (ret != CUDA_SUCCESS) {
snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
resp->err = strdup(buf);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
CUresult ret;
CUcontext ctx = NULL;
CUdevice device = -1;
*free = 0;
*total = 0;
ret = (*h.cuDeviceGet)(&device, i);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device failed to initialize");
return;
}
// To get memory we have to set (and release) a context
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to get device context %d", ret);
return;
}
ret = (*h.cuMemGetInfo_v2)(free, total);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library device memory info lookup failure %d", ret);
// Best effort on failure...
(*h.cuCtxDestroy)(ctx);
return;
}
ret = (*h.cuCtxDestroy)(ctx);
if (ret != CUDA_SUCCESS) {
LOG(1, "cuda driver library failed to release device context %d", ret);
}
}
void nvcuda_release(nvcuda_handle_t h) {
LOG(h.verbose, "releasing cuda driver library\n");
UNLOAD_LIBRARY(h.handle);
// TODO and other context release logic?
h.handle = NULL;
}
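// Illustrative sketch (hypothetical helper, not referenced elsewhere): after
// a successful nvcuda_init, nvcuda_get_free can be polled cheaply to watch
// VRAM headroom without re-running the full bootstrap.
static void example_poll_free_vram(nvcuda_handle_t h, int device_id) {
  uint64_t free = 0;
  uint64_t total = 0;
  nvcuda_get_free(h, device_id, &free, &total);
  if (total > 0) {
    fprintf(stderr, "device %d: %llu of %llu bytes free\n", device_id,
            (unsigned long long)free, (unsigned long long)total);
  }
}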
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_NVCUDA_H__
#define __GPU_INFO_NVCUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
CUDA_ERROR_UNKNOWN = 999,
// Other values omitted for now...
} CUresult;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
// TODO - not yet wired up but may be useful for Jetson or other
// integrated GPU scenarios with shared memory
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
} CUdevice_attribute;
typedef void *nvcudaDevice_t; // Opaque is sufficient
typedef struct nvcudaMemory_st {
uint64_t total;
uint64_t free;
} nvcudaMemory_t;
typedef struct nvcudaDriverVersion {
int major;
int minor;
} nvcudaDriverVersion_t;
typedef struct CUuuid_st {
unsigned char bytes[16];
} CUuuid;
typedef int CUdevice;
typedef void* CUcontext;
typedef struct nvcuda_handle {
void *handle;
uint16_t verbose;
int driver_major;
int driver_minor;
CUresult (*cuInit)(unsigned int Flags);
CUresult (*cuDriverGetVersion)(int *driverVersion);
CUresult (*cuDeviceGetCount)(int *);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
CUresult (*cuDeviceGetName)(char *name, int len, CUdevice dev);
// Context specific aspects
CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
CUresult (*cuCtxDestroy)(CUcontext ctx);
} nvcuda_handle_t;
typedef struct nvcuda_init_resp {
char *err; // If err is non-null handle is invalid
nvcuda_handle_t ch;
int num_devices;
CUresult cudaErr;
} nvcuda_init_resp_t;
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
void nvcuda_release(nvcuda_handle_t ch);
#endif // __GPU_INFO_NVCUDA_H__
#endif // __APPLE__
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <string.h>
#include "gpu_info_nvml.h"
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
nvml_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
}
void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret);
*free = 0;
return;
}
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
*free = 0;
return;
}
*free = memInfo.free;
*total = memInfo.total;
*used = memInfo.used;
}
void nvml_release(nvml_handle_t h) {
LOG(h.verbose, "releasing nvml library\n");
nvmlReturn_t ret;
ret = (*h.nvmlShutdown)();
if (ret != NVML_SUCCESS) {
LOG(1, "error during nvmlShutdown %d", ret);
}
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_NVML_H__
#define __GPU_INFO_NVML_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef enum nvmlBrandType_enum
{
NVML_BRAND_UNKNOWN = 0,
} nvmlBrandType_t;
typedef struct nvml_handle {
void *handle;
uint16_t verbose;
nvmlReturn_t (*nvmlInit_v2)(void);
nvmlReturn_t (*nvmlShutdown)(void);
nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
} nvml_handle_t;
typedef struct nvml_init_resp {
char *err; // If err is non-null handle is invalid
nvml_handle_t ch;
} nvml_init_resp_t;
typedef struct nvml_compute_capability {
char *err;
int major;
int minor;
} nvml_compute_capability_t;
void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
void nvml_release(nvml_handle_t ch);
#endif // __GPU_INFO_NVML_H__
#endif // __APPLE__
#ifndef __APPLE__
#include "gpu_info_oneapi.h"
#include <string.h>
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
ze_result_t ret;
resp->err = NULL;
resp->oh.devices = NULL;
resp->oh.num_devices = NULL;
resp->oh.drivers = NULL;
resp->oh.num_drivers = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d;
struct lookup {
char *s;
void **p;
} l[] = {
{"zesInit", (void *)&resp->oh.zesInit},
{"zesDriverGet", (void *)&resp->oh.zesDriverGet},
{"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
{"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
{"zesDeviceEnumMemoryModules",
(void *)&resp->oh.zesDeviceEnumMemoryModules},
{"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
{"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
{NULL, NULL},
};
resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
if (!resp->oh.handle) {
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Intel GPUs: %s\n",
oneapi_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
}
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose,
"wiring Level-Zero management library functions in %s\n",
oneapi_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
if (!*(l[i].p)) {
char *msg = LOAD_ERR();
LOG(resp->oh.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->oh.handle);
resp->oh.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
free(msg);
resp->err = strdup(buf);
return;
}
}
LOG(resp->oh.verbose, "calling zesInit\n");
ret = (*resp->oh.zesInit)(0);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "calling zesDriverGet\n");
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
LOG(resp->oh.verbose, "oneapi driver count: %d\n", resp->oh.num_drivers);
resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
resp->oh.devices =
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get driver count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
for (d = 0; d < resp->oh.num_drivers; d++) {
LOG(resp->oh.verbose, "calling zesDeviceGet count %d: %p\n", d, resp->oh.drivers[d]);
ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
&resp->oh.num_devices[d], NULL);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
resp->oh.devices[d] =
malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
ret = (*resp->oh.zesDeviceGet)(
resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
if (ret != ZE_RESULT_SUCCESS) {
LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
snprintf(buf, buflen, "unable to get device count: %x", ret);
resp->err = strdup(buf);
oneapi_release(resp->oh);
return;
}
}
return;
}
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp) {
ze_result_t ret;
resp->err = NULL;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
const int buflen = 256;
char buf[buflen + 1];
int i, d, m;
if (h.handle == NULL) {
resp->err = strdup("Level-Zero handle not initialized");
return;
}
if (driver >= h.num_drivers || device >= h.num_devices[driver]) {
resp->err = strdup("driver or device index out of bounds");
return;
}
resp->total = 0;
resp->free = 0;
zes_device_ext_properties_t ext_props;
ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
ext_props.pNext = NULL;
zes_device_properties_t props;
props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
props.pNext = &ext_props;
ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get device properties: %d", ret);
resp->err = strdup(buf);
return;
}
snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", props.modelName);
// TODO this needs to map to ONEAPI_DEVICE_SELECTOR syntax
// (this is probably wrong...)
// TODO - the driver isn't included - what if there are multiple drivers?
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
if (h.verbose) {
// When in verbose mode, report more information about
// the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
props.modelName);
LOG(h.verbose, "[%d:%d] oneAPI brand: %s\n", driver, device,
props.brandName);
LOG(h.verbose, "[%d:%d] oneAPI vendor: %s\n", driver, device,
props.vendorName);
LOG(h.verbose, "[%d:%d] oneAPI S/N: %s\n", driver, device,
props.serialNumber);
LOG(h.verbose, "[%d:%d] oneAPI board number: %s\n", driver, device,
props.boardNumber);
}
// TODO
// Compute Capability equivalent in resp->major, resp->minor, resp->patch
uint32_t memCount = 0;
ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
NULL);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
ret);
resp->err = strdup(buf);
return;
}
LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
(*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
for (m = 0; m < memCount; m++) {
zes_mem_state_t state;
state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
state.pNext = NULL;
ret = (*h.zesMemoryGetState)(mems[m], &state);
if (ret != ZE_RESULT_SUCCESS) {
snprintf(buf, buflen, "unable to get memory state: %x", ret);
resp->err = strdup(buf);
free(mems);
return;
}
resp->total += state.size;
resp->free += state.free;
}
free(mems);
}
void oneapi_release(oneapi_handle_t h) {
int d;
LOG(h.verbose, "releasing oneapi library\n");
for (d = 0; d < h.num_drivers; d++) {
if (h.devices != NULL && h.devices[d] != NULL) {
free(h.devices[d]);
}
}
if (h.devices != NULL) {
free(h.devices);
h.devices = NULL;
}
if (h.num_devices != NULL) {
free(h.num_devices);
h.num_devices = NULL;
}
if (h.drivers != NULL) {
free(h.drivers);
h.drivers = NULL;
}
h.num_drivers = 0;
UNLOAD_LIBRARY(h.handle);
h.handle = NULL;
}
int oneapi_get_device_count(oneapi_handle_t h, int driver) {
if (h.handle == NULL || h.num_devices == NULL) {
return 0;
}
if (driver >= h.num_drivers) {
return 0;
}
return (int)h.num_devices[driver];
}
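// Illustrative sketch (hypothetical caller; the library path is an
// assumption): enumerate every Level-Zero device discovered by oneapi_init
// and report its VRAM via oneapi_check_vram.
static void example_oneapi_usage(void) {
  oneapi_init_resp_t resp = {0};
  resp.oh.verbose = 1;
  oneapi_init("/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so", &resp);
  if (resp.err != NULL) {
    fprintf(stderr, "oneapi unavailable: %s\n", resp.err);
    free(resp.err);
    return;
  }
  for (int d = 0; d < (int)resp.oh.num_drivers; d++) {
    for (int dev = 0; dev < oneapi_get_device_count(resp.oh, d); dev++) {
      mem_info_t mem = {0};
      oneapi_check_vram(resp.oh, d, dev, &mem);
      if (mem.err != NULL) {
        fprintf(stderr, "[%d:%d] vram lookup failed: %s\n", d, dev, mem.err);
        free(mem.err);
        continue;
      }
      fprintf(stderr, "[%d:%d] total %llu free %llu\n", d, dev,
              (unsigned long long)mem.total, (unsigned long long)mem.free);
    }
  }
  oneapi_release(resp.oh);
}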
#endif // __APPLE__
#ifndef __APPLE__
#ifndef __GPU_INFO_ONEAPI_H__
#define __GPU_INFO_ONEAPI_H__
#include "gpu_info.h"
#define ZE_MAX_DEVICE_NAME 256
#define ZE_MAX_DEVICE_UUID_SIZE 16
#define ZES_STRING_PROPERTY_SIZE 64
#define ZE_BIT(_i) (1 << _i)
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum ze_result_t {
ZE_RESULT_SUCCESS = 0,
// Other values omitted for now...
} ze_result_t;
typedef uint8_t ze_bool_t;
typedef struct _zes_driver_handle_t *zes_driver_handle_t;
typedef struct _zes_device_handle_t *zes_device_handle_t;
typedef struct _zes_mem_handle_t *zes_mem_handle_t;
typedef enum _ze_structure_type_t {
ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_structure_type_t;
typedef enum _zes_structure_type_t {
ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_structure_type_t;
typedef enum _zes_mem_type_t {
ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_mem_type_t;
typedef enum _zes_mem_loc_t {
ZES_MEM_LOC_SYSTEM = 0,
ZES_MEM_LOC_DEVICE = 1,
ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
} zes_mem_loc_t;
typedef enum _zes_mem_health_t {
ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
} zes_mem_health_t;
typedef struct _ze_device_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} ze_device_uuid_t;
typedef struct _zes_uuid_t {
uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
} zes_uuid_t;
typedef enum _ze_device_type_t {
ZE_DEVICE_TYPE_GPU = 1,
ZE_DEVICE_TYPE_CPU = 2,
ZE_DEVICE_TYPE_FPGA = 3,
ZE_DEVICE_TYPE_MCA = 4,
ZE_DEVICE_TYPE_VPU = 5,
ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} ze_device_type_t;
typedef enum _zes_device_type_t {
ZES_DEVICE_TYPE_GPU = 1,
ZES_DEVICE_TYPE_CPU = 2,
ZES_DEVICE_TYPE_FPGA = 3,
ZES_DEVICE_TYPE_MCA = 4,
ZES_DEVICE_TYPE_VPU = 5,
ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
} zes_device_type_t;
typedef uint32_t ze_device_property_flags_t;
typedef enum _ze_device_property_flag_t {
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} ze_device_property_flag_t;
typedef uint32_t zes_device_property_flags_t;
typedef enum _zes_device_property_flag_t {
ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
} zes_device_property_flag_t;
typedef struct _ze_device_properties_t {
ze_structure_type_t stype;
void *pNext;
ze_device_type_t type;
uint32_t vendorId;
uint32_t deviceId;
ze_device_property_flags_t flags;
uint32_t subdeviceId;
uint32_t coreClockRate;
uint64_t maxMemAllocSize;
uint32_t maxHardwareContexts;
uint32_t maxCommandQueuePriority;
uint32_t numThreadsPerEU;
uint32_t physicalEUSimdWidth;
uint32_t numEUsPerSubslice;
uint32_t numSubslicesPerSlice;
uint32_t numSlices;
uint64_t timerResolution;
uint32_t timestampValidBits;
uint32_t kernelTimestampValidBits;
ze_device_uuid_t uuid;
char name[ZE_MAX_DEVICE_NAME];
} ze_device_properties_t;
typedef struct _zes_device_properties_t {
zes_structure_type_t stype;
void *pNext;
ze_device_properties_t core;
uint32_t numSubdevices;
char serialNumber[ZES_STRING_PROPERTY_SIZE];
char boardNumber[ZES_STRING_PROPERTY_SIZE];
char brandName[ZES_STRING_PROPERTY_SIZE];
char modelName[ZES_STRING_PROPERTY_SIZE];
char vendorName[ZES_STRING_PROPERTY_SIZE];
char driverVersion[ZES_STRING_PROPERTY_SIZE];
} zes_device_properties_t;
typedef struct _zes_device_ext_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_uuid_t uuid;
zes_device_type_t type;
zes_device_property_flags_t flags;
} zes_device_ext_properties_t;
typedef struct _zes_mem_properties_t {
zes_structure_type_t stype;
void *pNext;
zes_mem_type_t type;
ze_bool_t onSubdevice;
uint32_t subdeviceId;
zes_mem_loc_t location;
uint64_t physicalSize;
int32_t busWidth;
int32_t numChannels;
} zes_mem_properties_t;
typedef struct _zes_mem_state_t {
zes_structure_type_t stype;
const void *pNext;
zes_mem_health_t health;
uint64_t free;
uint64_t size;
} zes_mem_state_t;
typedef struct oneapi_handle {
void *handle;
uint16_t verbose;
uint32_t num_drivers;
zes_driver_handle_t *drivers;
uint32_t *num_devices;
zes_device_handle_t **devices;
// TODO Driver major, minor information
// int driver_major;
// int driver_minor;
ze_result_t (*zesInit)(int);
ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
zes_device_handle_t *phDevices);
ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
zes_device_properties_t *pProperties);
ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
uint32_t *pCount,
zes_mem_handle_t *phMemory);
ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
zes_mem_properties_t *pProperties);
ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
zes_mem_state_t *pState);
} oneapi_handle_t;
typedef struct oneapi_init_resp {
char *err; // If err is non-null handle is invalid
oneapi_handle_t oh;
} oneapi_init_resp_t;
typedef struct oneapi_version_resp {
ze_result_t status;
char *str; // Contains version or error string if status != 0
} oneapi_version_resp_t;
void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
mem_info_t *resp);
void oneapi_release(oneapi_handle_t h);
int oneapi_get_device_count(oneapi_handle_t h, int driver);
#endif // __GPU_INFO_ONEAPI_H__
#endif // __APPLE__
package gpu
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/ollama/ollama/format"
)
var CudartGlobs = []string{
"/usr/local/cuda/lib64/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
"/usr/lib/wsl/lib/libcudart.so*",
"/usr/lib/wsl/drivers/*/libcudart.so*",
"/opt/cuda/lib64/libcudart.so*",
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
"/usr/local/cuda/lib*/libcudart.so*",
"/usr/lib*/libcudart.so*",
"/usr/local/lib*/libcudart.so*",
}
var NvmlGlobs = []string{}
var NvcudaGlobs = []string{
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
"/usr/lib/*-linux-gnu/libcuda.so*",
"/usr/lib/wsl/lib/libcuda.so*",
"/usr/lib/wsl/drivers/*/libcuda.so*",
"/opt/cuda/lib*/libcuda.so*",
"/usr/local/cuda/lib*/libcuda.so*",
"/usr/lib*/libcuda.so*",
"/usr/local/lib*/libcuda.so*",
}
var OneapiGlobs = []string{
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
"/usr/lib*/libze_intel_gpu.so*",
}
var (
CudartMgmtName = "libcudart.so*"
NvcudaMgmtName = "libcuda.so*"
NvmlMgmtName = "" // not currently wired on linux
OneapiMgmtName = "libze_intel_gpu.so"
)
func GetCPUMem() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached, freeSwap uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
switch {
case strings.HasPrefix(line, "MemTotal:"):
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
case strings.HasPrefix(line, "MemAvailable:"):
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
case strings.HasPrefix(line, "MemFree:"):
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
case strings.HasPrefix(line, "Buffers:"):
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
case strings.HasPrefix(line, "Cached:"):
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
case strings.HasPrefix(line, "SwapFree:"):
_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
default:
continue
}
if err != nil {
return mem, err
}
}
mem.TotalMemory = total * format.KibiByte
mem.FreeSwap = freeSwap * format.KibiByte
if available > 0 {
mem.FreeMemory = available * format.KibiByte
} else {
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
}
return mem, nil
}
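// Worked example (illustrative values): given /proc/meminfo lines such as
//
//	MemTotal:       32658932 kB
//	MemAvailable:   20512340 kB
//
// GetCPUMem reports TotalMemory = 32658932 * format.KibiByte and, since
// MemAvailable is present, FreeMemory = 20512340 * format.KibiByte. On older
// kernels that lack MemAvailable it falls back to MemFree+Buffers+Cached.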
//go:build linux || windows
package gpu
import (
"log/slog"
"strings"
)
func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
ids := []string{}
for _, info := range gpuInfo {
if info.Library != "oneapi" {
// TODO shouldn't happen if things are wired correctly...
slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", info.Library)
continue
}
ids = append(ids, info.ID)
}
return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}
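// Illustrative usage (hypothetical caller, not defined in this commit): the
// key/value pair returned above is meant to be applied to the runner's
// environment before launching, e.g.
//
//	key, val := oneapiGetVisibleDevicesEnv(gpus)
//	if key != "" {
//		os.Setenv(key, val) // e.g. ONEAPI_DEVICE_SELECTOR=level_zero:0,1
//	}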
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.NotEmpty(t, len(info))
assert.Contains(t, "cuda rocm cpu metal", info[0].Library)
if info[0].Library != "cpu" {
assert.Greater(t, info[0].TotalMemory, uint64(0))
assert.Greater(t, info[0].FreeMemory, uint64(0))
}
}
func TestCPUMemInfo(t *testing.T) {
info, err := GetCPUMem()
require.NoError(t, err)
switch runtime.GOOS {
case "darwin":
t.Skip("CPU memory not populated on darwin")
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package gpu
import (
"fmt"
"syscall"
"unsafe"
)
type MEMORYSTATUSEX struct {
length uint32
MemoryLoad uint32
TotalPhys uint64
AvailPhys uint64
TotalPageFile uint64
AvailPageFile uint64
TotalVirtual uint64
AvailVirtual uint64
AvailExtendedVirtual uint64
}
var (
k32 = syscall.NewLazyDLL("kernel32.dll")
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
)
var CudartGlobs = []string{
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
}
var NvmlGlobs = []string{
"c:\\Windows\\System32\\nvml.dll",
}
var NvcudaGlobs = []string{
"c:\\windows\\system*\\nvcuda.dll",
}
var OneapiGlobs = []string{
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
}
var (
CudartMgmtName = "cudart64_*.dll"
NvcudaMgmtName = "nvcuda.dll"
NvmlMgmtName = "nvml.dll"
OneapiMgmtName = "ze_intel_gpu64.dll"
)
func GetCPUMem() (memInfo, error) {
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
if r1 == 0 {
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
}
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
}
package gpu
import (
"fmt"
"log/slog"
"github.com/ollama/ollama/format"
)
type memInfo struct {
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
FreeSwap uint64 `json:"free_swap,omitempty"`
}
// Beginning of an `ollama info` command
type GpuInfo struct {
memInfo
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant CPUCapability `json:"variant"`
// MinimumMemory represents the minimum memory required to use the GPU
MinimumMemory uint64 `json:"-"`
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
DependencyPath string `json:"lib_path,omitempty"`
// Extra environment variables specific to the GPU as list of [key,value]
EnvWorkarounds [][2]string `json:"envs,omitempty"`
// Set to true if we can NOT reliably discover FreeMemory. A value of true indicates
// the FreeMemory is best effort, and may over or under report actual memory usage.
// False indicates FreeMemory can generally be trusted on this GPU.
UnreliableFreeMemory bool
// GPU information
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
Name string `json:"name"` // user friendly name if available
Compute string `json:"compute"` // Compute Capability or gfx
// Driver Information - TODO no need to put this on each GPU
DriverMajor int `json:"driver_major,omitempty"`
DriverMinor int `json:"driver_minor,omitempty"`
// TODO other performance capability info to help in scheduling decisions
}
type CPUInfo struct {
GpuInfo
}
type CudaGPUInfo struct {
GpuInfo
OSOverhead uint64 // Memory overhead between the driver library and management library
index int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
type RocmGPUInfo struct {
GpuInfo
usedFilepath string //nolint:unused,nolintlint
index int //nolint:unused,nolintlint
}
type RocmGPUInfoList []RocmGPUInfo
type OneapiGPUInfo struct {
GpuInfo
driverIndex int //nolint:unused,nolintlint
gpuIndex int //nolint:unused,nolintlint
}
type OneapiGPUInfoList []OneapiGPUInfo
type GpuInfoList []GpuInfo
// Split up the set of gpu infos by Library and variant
func (l GpuInfoList) ByLibrary() []GpuInfoList {
resp := []GpuInfoList{}
libs := []string{}
for _, info := range l {
found := false
requested := info.Library
if info.Variant != CPUCapabilityNone {
requested += "_" + info.Variant.String()
}
for i, lib := range libs {
if lib == requested {
resp[i] = append(resp[i], info)
found = true
break
}
}
if !found {
libs = append(libs, requested)
resp = append(resp, []GpuInfo{info})
}
}
return resp
}
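// Illustrative example (hypothetical data): three GPUs spanning two library
// keys collapse into two groups, preserving order of first appearance:
//
//	l := GpuInfoList{{Library: "cuda"}, {Library: "rocm"}, {Library: "cuda"}}
//	groups := l.ByLibrary() // len(groups) == 2; groups[0] holds both cuda GPUs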
// Report the GPU information to the log at Info level
func (l GpuInfoList) LogDetails() {
for _, g := range l {
slog.Info("inference compute",
"id", g.ID,
"library", g.Library,
"compute", g.Compute,
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
"name", g.Name,
"total", format.HumanBytes2(g.TotalMemory),
"available", format.HumanBytes2(g.FreeMemory),
)
}
}
// Sort by Free Space
type ByFreeMemory []GpuInfo
func (a ByFreeMemory) Len() int { return len(a) }
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
type CPUCapability uint32
// Override at build time when building base GPU runners
var GPURunnerCPUCapability = CPUCapabilityAVX
const (
CPUCapabilityNone CPUCapability = iota
CPUCapabilityAVX
CPUCapabilityAVX2
// TODO AVX512
)
func (c CPUCapability) String() string {
switch c {
case CPUCapabilityAVX:
return "avx"
case CPUCapabilityAVX2:
return "avx2"
default:
return "no vector extensions"
}
}
# Integration Tests
This directory contains integration tests that exercise Ollama end-to-end to verify behavior.
By default, these tests are disabled so `go test ./...` will exercise only unit tests. To run the integration tests you must pass the integration tag: `go test -tags=integration ./...`
The integration tests have two modes of operation:
1. By default, they start the server on a random port, run the tests, and then shut down the server.
2. If `OLLAMA_TEST_EXISTING` is set to a non-empty string, the tests run against an existing running server, which can be remote (see the example below).
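For example, to run the suite against a server that is already running (assuming the standard `OLLAMA_HOST` client variable points at it; adjust the address for your setup):

`OLLAMA_TEST_EXISTING=1 OLLAMA_HOST=http://127.0.0.1:11434 go test -tags=integration ./...`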
//go:build integration
package integration
import (
"context"
"log/slog"
"os"
"runtime"
"testing"
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestOrcaMiniBlueSky(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "why is the sky blue?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
}
func TestUnicodeModelDir(t *testing.T) {
// This is only useful for Windows with utf-16 characters, so skip this test for other platforms
if runtime.GOOS != "windows" {
t.Skip("Unicode test only applicable to windows")
}
// Only works for local testing
if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
t.Skip("TestUnicodeModelDir only works for local testing, skipping")
}
modelDir, err := os.MkdirTemp("", "ollama_埃")
require.NoError(t, err)
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)
t.Setenv("OLLAMA_MODELS", modelDir)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
req := api.GenerateRequest{
Model: "orca-mini",
Prompt: "why is the sky blue?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
},
}
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
}
//go:build integration
package integration
import (
"context"
"log/slog"
"os"
"strconv"
"sync"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
)
func TestMultiModelConcurrency(t *testing.T) {
var (
req = [2]api.GenerateRequest{
{
Model: "orca-mini",
Prompt: "why is the ocean blue?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
}, {
Model: "tinydolphin",
Prompt: "what is the origin of the us thanksgiving holiday?",
Stream: &stream,
KeepAlive: &api.Duration{Duration: 10 * time.Second},
Options: map[string]interface{}{
"seed": 42,
"temperature": 0.0,
},
},
}
resp = [2][]string{
{"sunlight"},
{"england", "english", "massachusetts", "pilgrims", "british"},
}
)
var wg sync.WaitGroup
wg.Add(len(req))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
for i := 0; i < len(req); i++ {
require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
}
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
}(i)
}
wg.Wait()
}
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
req, resp := GenerateRequests()
reqLimit := len(req)
iterLimit := 5
if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
maxVram, err := strconv.ParseUint(s, 10, 64)
require.NoError(t, err)
// Don't hammer on small VRAM cards...
if maxVram < 4*format.GibiByte {
reqLimit = min(reqLimit, 2)
iterLimit = 2
}
}
ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Get the server running (if applicable) and warm the model up with a single initial request
DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
var wg sync.WaitGroup
wg.Add(reqLimit)
for i := 0; i < reqLimit; i++ {
go func(i int) {
defer wg.Done()
for j := 0; j < iterLimit; j++ {
slog.Info("Starting", "req", i, "iter", j)
// On slower GPUs it can take a while to process the concurrent requests
// so we allow a much longer initial timeout
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
}
}(i)
}
wg.Wait()
}
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
if s == "" {
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
}
maxVram, err := strconv.ParseUint(s, 10, 64)
if err != nil {
t.Fatal(err)
}
type model struct {
name string
size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
}
smallModels := []model{
{
name: "orca-mini",
size: 2992 * format.MebiByte,
},
{
name: "phi",
size: 2616 * format.MebiByte,
},
{
name: "gemma:2b",
size: 2364 * format.MebiByte,
},
{
name: "stable-code:3b",
size: 2608 * format.MebiByte,
},
{
name: "starcoder2:3b",
size: 2166 * format.MebiByte,
},
}
mediumModels := []model{
{
name: "llama2",
size: 5118 * format.MebiByte,
},
{
name: "mistral",
size: 4620 * format.MebiByte,
},
{
name: "orca-mini:7b",
size: 5118 * format.MebiByte,
},
{
name: "dolphin-mistral",
size: 4620 * format.MebiByte,
},
{
name: "gemma:7b",
size: 5000 * format.MebiByte,
},
{
name: "codellama:7b",
size: 5118 * format.MebiByte,
},
}
// These seem to be too slow to be useful...
// largeModels := []model{
// {
// name: "llama2:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "codellama:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "orca-mini:13b",
// size: 7400 * format.MebiByte,
// },
// {
// name: "gemma:7b",
// size: 5000 * format.MebiByte,
// },
// {
// name: "starcoder2:15b",
// size: 9100 * format.MebiByte,
// },
// }
var chosenModels []model
switch {
case maxVram < 10000*format.MebiByte:
slog.Info("selecting small models")
chosenModels = smallModels
// case maxVram < 30000*format.MebiByte:
default:
slog.Info("selecting medium models")
chosenModels = mediumModels
// default:
// slog.Info("selecting large models")
// chosenModels = largeModels
}
req, resp := GenerateRequests()
for i := range req {
if i >= len(chosenModels) {
break
}
req[i].Model = chosenModels[i].name
}
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
// Make sure all the models are pulled before we get started
for _, r := range req {
require.NoError(t, PullIfMissing(ctx, client, r.Model))
}
var wg sync.WaitGroup
consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
for i := 0; i < len(req); i++ {
// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
if i > 1 && consumed > maxVram {
slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
break
}
consumed += chosenModels[i].size
slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
wg.Add(1)
go func(i int) {
defer wg.Done()
for j := 0; j < 3; j++ {
slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
}
}(i)
}
go func() {
for {
time.Sleep(2 * time.Second)
select {
case <-ctx.Done():
return
default:
models, err := client.ListRunning(ctx)
if err != nil {
slog.Warn("failed to list running models", "error", err)
continue
}
for _, m := range models.Models {
slog.Info("loaded model snapshot", "model", m)
}
}
}
}()
wg.Wait()
}
//go:build integration
package integration
import (
"context"
"testing"
"time"
"github.com/ollama/ollama/api"
)
func TestContextExhaustion(t *testing.T) {
// A longer timeout is needed for small footprint GPUs
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
// Set up the test data
req := api.GenerateRequest{
Model: "llama2",
Prompt: "Write me a story with a ton of emojis?",
Stream: &stream,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_ctx": 128,
},
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived"}, 120*time.Second, 10*time.Second)
}