Merge pull request #1146 from dhiltgen/ext_server_cgo

Add cgo implementation for llama.cpp

Merge pull request #1146 from dhiltgen/ext_server_cgo
Add cgo implementation for llama.cpp
96fb441a · Daniel Hiltgen · GitHub · fabf2f34 · 495c06e4 · 96fb441a
Unverified Commit 96fb441a authored Dec 22, 2023 by Daniel Hiltgen Committed by GitHub Dec 22, 2023
20 changed files
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,8 +2,7 @@
 ollama
 app
 dist
-scripts
-llm/llama.cpp/ggml
 llm/llama.cpp/gguf
 .env
 .cache
+test_data
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ ollama
 ggml-metal.metal
 .cache
 *.exe
 .idea
\ No newline at end of file
+test_data
\ No newline at end of file
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "llm/llama.cpp/ggml"]
-    path = llm/llama.cpp/ggml
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
 [submodule "llm/llama.cpp/gguf"]
    path = llm/llama.cpp/gguf
    url = https://github.com/ggerganov/llama.cpp.git

--- a/Dockerfile.build
+++ b/Dockerfile.build
-# centos7 amd64 dependencies
+# Ubuntu 20.04 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
+FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
+ARG CUDA_VERSION=11.3.1-1
-    yum update -y && \
+ARG CMAKE_VERSION=3.22.1
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
+# ROCm only supports amd64
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
+ARG ROCM_VERSION=6.0
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+ARG CLBLAST_VER=1.6.1
-# centos8 arm64 dependencies
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
+RUN apt-get update && \
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+    apt-get install -y wget gnupg && \
-RUN yum install -y git cmake
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
+ENV ROCM_PATH=/opt/rocm
+# Ubuntu 20.04 arm64 dependencies
+FROM --platform=linux/arm64 ubuntu:20.04 AS base-arm64
+ARG CUDA_VERSION=11.3.1-1
+ARG CMAKE_VERSION=3.27.6
+RUN apt-get update && \
+    apt-get install -y wget gnupg && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/cuda-ubuntu2004.pin && \
+    mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa//3bf863cc.pub && \
+    echo "deb [by-hash=no] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/sbsa/ /" > /etc/apt/sources.list.d/cuda.list && \
+    wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    apt-get update && \
+    apt-cache madison cuda && \
+    DEBIAN_FRONTEND=noninteractive apt-get -y install cuda=${CUDA_VERSION} 
 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG GOLANG_VERSION=1.21.3
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
 # install go
-ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
+ADD https://dl.google.com/go/go${GOLANG_VERSION}.linux-$TARGETARCH.tar.gz /tmp/go${GOLANG_VERSION}.tar.gz
-RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go${GOLANG_VERSION}.tar.gz
 # build the final binary
 WORKDIR /go/src/github.com/jmorganca/ollama
@@ -26,6 +68,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}
 RUN /usr/local/go/bin/go generate ./... && \
    /usr/local/go/bin/go build .
--- a/README.md
+++ b/README.md
@@ -192,13 +192,19 @@ Install `cmake` and `go`:
 brew install cmake go
 ```
-Then generate dependencies and build:
+Then generate dependencies:
 ```
 go generate ./...
+```
+Then build the binary:
+```
 go build .
 ```
+More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
+### Running local builds
 Next, start the server:
 ```

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 	}
 	if err := client.Generate(ctx, &request, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
+		switch {
+		case errors.Is(err, context.Canceled):
 			return nil
+		case strings.Contains(err.Error(), "unsupported model format"):
+			// pull and retry to see if the model has been updated
+			parts := strings.Split(opts.Model, string(os.PathSeparator))
+			if len(parts) == 1 {
+				// this is a library model, log some info
+				fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
+			}
+			if err := PullHandler(cmd, []string{opts.Model}); err != nil {
+				fmt.Printf("Error: %s\n", err)
+				return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
+			}
+			// retry
+			if err := client.Generate(ctx, &request, fn); err != nil {
+				if errors.Is(err, context.Canceled) {
+					return nil
+				}
+				return err
+			}
+		default:
+			return err
 		}
-		return err
 	}
 	if opts.Prompt != "" {
 		fmt.Println()

--- a/docs/development.md
+++ b/docs/development.md
@@ -34,6 +34,35 @@ Now you can run `ollama`:
 ## Building on Linux with GPU support
- Install cmake and nvidia-cuda-toolkit
- run `go generate ./...`
+### Linux/Windows CUDA (NVIDIA)
- run `go build .`
+*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
+Then generate dependencies:
+```
+go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+### Linux ROCm (AMD)
+*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.
+Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
+```
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+ROCm requires elevated privileges to access the GPU at runtime.  On most distros you can add your user account to the `render` group, or run as root.
+## Containerized Build
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
\ No newline at end of file
--- a/go.mod
+++ b/go.mod
@@ -7,7 +7,7 @@ require (
 	github.com/gin-gonic/gin v1.9.1
 	github.com/olekukonko/tablewriter v0.0.5
 	github.com/spf13/cobra v1.7.0
-	github.com/stretchr/testify v1.8.3
+	github.com/stretchr/testify v1.8.4
 	golang.org/x/sync v0.3.0
 )

--- a/go.sum
+++ b/go.sum
@@ -98,8 +98,9 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
-github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
 github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
 github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
 github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
 github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M=

--- a/gpu/gpu.go
+++ b/gpu/gpu.go
+//go:build linux || windows
+package gpu
+/*
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -lpthread
+#include "gpu_info.h"
+*/
+import "C"
+import (
+	"fmt"
+	"log"
+	"sync"
+	"unsafe"
+	"github.com/jmorganca/ollama/api"
+)
+// handles holds the management-library handle produced by a successful
+// cuda_init or rocm_init call. initGPUHandles populates at most one of the
+// two fields; both remain nil when neither library could be initialized.
+type handles struct {
+	cuda *C.cuda_handle_t
+	rocm *C.rocm_handle_t
+}
+var gpuMutex sync.Mutex
+var gpuHandles *handles = nil
+// initGPUHandles probes for a GPU management library and records the result
+// in gpuHandles. CUDA is tried first; ROCm is only tried when CUDA
+// initialization reports an error. If both fail, gpuHandles is left with two
+// nil handles.
+//
+// Note: gpuMutex must already be held
+func initGPUHandles() {
+	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
+	log.Printf("Detecting GPU type")
+	gpuHandles = &handles{nil, nil}
+
+	var cudaResp C.cuda_init_resp_t
+	C.cuda_init(&cudaResp)
+	if cudaResp.err == nil {
+		log.Printf("Nvidia GPU detected")
+		cudaHandle := cudaResp.ch
+		gpuHandles.cuda = &cudaHandle
+		return
+	}
+	// The C side allocates the error string; log it, then release it.
+	log.Printf("CUDA not detected: %s", C.GoString(cudaResp.err))
+	C.free(unsafe.Pointer(cudaResp.err))
+
+	var rocmResp C.rocm_init_resp_t
+	C.rocm_init(&rocmResp)
+	if rocmResp.err != nil {
+		log.Printf("ROCm not detected: %s", C.GoString(rocmResp.err))
+		C.free(unsafe.Pointer(rocmResp.err))
+		return
+	}
+	log.Printf("Radeon GPU detected")
+	rocmHandle := rocmResp.rh
+	gpuHandles.rocm = &rocmHandle
+}
+// GetGPUInfo reports which driver is in use ("CUDA", "ROCM" or "CPU"), the
+// matching library name, and the total/free memory reported for it. The GPU
+// handles are initialized lazily on the first call, under gpuMutex. When no
+// GPU handle is available, or the GPU memory lookup fails, it falls back to
+// reporting system RAM with Driver "CPU".
+func GetGPUInfo() GpuInfo {
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}
+
+	var memInfo C.mem_info_t
+	var resp GpuInfo
+	switch {
+	case gpuHandles.cuda != nil:
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+		if memInfo.err == nil {
+			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
+		} else {
+			log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+		}
+	case gpuHandles.rocm != nil:
+		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+		if memInfo.err == nil {
+			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
+		} else {
+			log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
+			C.free(unsafe.Pointer(memInfo.err))
+		}
+	}
+
+	// No driver chosen means there was no GPU handle or its lookup failed;
+	// cpu_check_ram resets memInfo.err before filling in system memory.
+	if resp.Driver == "" {
+		C.cpu_check_ram(&memInfo)
+		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
+	}
+	if memInfo.err != nil {
+		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
+		C.free(unsafe.Pointer(memInfo.err))
+		return resp
+	}
+
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
+	return resp
+}
+// CheckVRAM returns the free memory reported by GetGPUInfo when a GPU driver
+// is in use. It returns an error when the driver is "CPU" or when no free
+// memory was reported.
+func CheckVRAM() (int64, error) {
+	info := GetGPUInfo()
+	if info.Driver == "CPU" || info.FreeMemory == 0 {
+		return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
+	}
+	return int64(info.FreeMemory), nil
+}
+// NumGPU returns how many model layers should be offloaded to the GPU.
+//
+// An explicit opts.NumGPU (anything other than -1) is returned unchanged.
+// Otherwise the count is estimated from the free memory reported by
+// GetGPUInfo: 75% of the number of layers that would fit, where a layer is
+// assumed to cost fileSizeBytes/numLayer bytes. It returns 0 when the driver
+// is "CPU" or when the per-layer size cannot be computed (numLayer <= 0, or a
+// file smaller than one byte per layer).
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	info := GetGPUInfo()
+	if info.Driver == "CPU" {
+		return 0
+	}
+
+	// Integer division by zero panics in Go, so reject inputs that would
+	// produce a zero divisor either here or in the layer estimate below.
+	if numLayer <= 0 {
+		log.Printf("invalid layer count %d, not offloading any layers to the GPU", numLayer)
+		return 0
+	}
+
+	/*
+		Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
+		We can store the model weights and the kv cache in vram,
+		to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file.
+	*/
+	// NOTE(review): the comment above mentions adding two layers for the kv
+	// cache, but no such adjustment is made here - confirm which is intended.
+	perLayer := fileSizeBytes / numLayer
+	if perLayer <= 0 {
+		log.Printf("invalid model size %d for %d layers, not offloading any layers to the GPU", fileSizeBytes, numLayer)
+		return 0
+	}
+	bytesPerLayer := uint64(perLayer)
+
+	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
+	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
+	log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)
+
+	return layers
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
+//go:build darwin
+package gpu
+import "C"
+import (
+	"runtime"
+	"github.com/jmorganca/ollama/api"
+)
+// CheckVRAM is a placeholder on darwin: it performs no lookup and always
+// returns 0 bytes with a nil error.
+func CheckVRAM() (int64, error) {
+	// TODO - assume metal, and return free memory?
+	return 0, nil
+}
+// GetGPUInfo returns a fixed description on darwin: driver "METAL", library
+// "default", and zero for both memory figures.
+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+	var info GpuInfo // TotalMemory and FreeMemory stay at their zero value
+	info.Driver = "METAL"
+	info.Library = "default"
+	return info
+}
+// NumGPU returns 1 on arm64 builds and 0 otherwise. The arguments are not
+// consulted on darwin.
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
+	// metal only supported on arm64
+	if runtime.GOARCH != "arm64" {
+		return 0
+	}
+	return 1
+}
+// nativeInit is a no-op on darwin and always returns nil.
+// NOTE(review): no caller is visible in this file - confirm it is still needed.
+func nativeInit() error {
+	return nil
+}
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
+#ifndef __APPLE__
+#ifndef __GPU_INFO_H__
+#define __GPU_INFO_H__
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef _WIN32
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#else
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+// TODO - refactor this with proper error message handling on windows
+// Formats the calling thread's last Windows error code as hex text.
+// Returns a pointer to static storage that is overwritten by the next call.
+inline static char *LOAD_ERR() {
+  // "0x" + up to 8 hex digits of a 32-bit error code + NUL needs 11 bytes;
+  // a smaller buffer silently truncates larger codes.
+  static char errbuf[16];
+  snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
+  return errbuf;
+}
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Result of a memory lookup (cpu_check_ram, cuda_check_vram, rocm_check_vram).
+// total and free are only written by those functions on success.
+typedef struct mem_info {
+  uint64_t total;
+  uint64_t free;
+  char *err;  // If non-NULL, caller responsible for freeing
+} mem_info_t;
+void cpu_check_ram(mem_info_t *resp);
+#ifdef __cplusplus
+}
+#endif
+#include "gpu_info_cuda.h"
+#include "gpu_info_rocm.h"
+#endif  // __GPU_INFO_H__
+#endif  // __APPLE__
\ No newline at end of file
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
+#include "gpu_info.h"
+// Fallbacks for CPU mode
+#ifdef _WIN32
+#include <sysinfoapi.h>
+// Windows: report total and available physical memory via
+// GlobalMemoryStatusEx. On failure resp->err is set to a strdup'd error
+// string that the caller must free; total/free are left untouched.
+void cpu_check_ram(mem_info_t *resp) {
+  resp->err = NULL;
+  MEMORYSTATUSEX info;
+  // dwLength must hold the structure size before the call, otherwise
+  // GlobalMemoryStatusEx fails.
+  info.dwLength = sizeof(info);
+  if (GlobalMemoryStatusEx(&info) != 0) {
+    resp->total = info.ullTotalPhys;
+    resp->free = info.ullAvailPhys;
+  } else {
+    resp->err = strdup(LOAD_ERR());
+  }
+  return;
+}
+#elif __linux__
+#include <errno.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+// Linux: report total and free RAM via sysinfo(2). On failure resp->err is
+// set to a strdup'd strerror message that the caller must free; total/free
+// are left untouched.
+void cpu_check_ram(mem_info_t *resp) {
+  struct sysinfo info;
+  resp->err = NULL;
+  if (sysinfo(&info) != 0) {
+    resp->err = strdup(strerror(errno));
+  } else {
+    // Widen to 64 bits before multiplying: totalram/freeram are unsigned long
+    // and mem_unit is unsigned int, so the product can overflow where
+    // unsigned long is 32 bits wide.
+    resp->total = (uint64_t)info.totalram * info.mem_unit;
+    resp->free = (uint64_t)info.freeram * info.mem_unit;
+  }
+  return;
+}
+#elif __APPLE__
+// TODO consider an Apple implementation that does something useful
+// mem_info_t cpu_check_ram() {
+//   mem_info_t resp = {0, 0, NULL};
+//   return resp;
+// }
+#else
+#error "Unsupported platform"
+#endif
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+#include "gpu_info_cuda.h"
+#include <string.h>
+#ifndef _WIN32
+const char *cuda_lib_paths[] = {
+    "libnvidia-ml.so",
+    "/usr/local/cuda/lib64/libnvidia-ml.so",
+    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
+    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
+    NULL,
+};
+#else
+const char *cuda_lib_paths[] = {
+    "nvml.dll",
+    "",
+    NULL,
+};
+#endif
+// Load the NVML library from the first entry of cuda_lib_paths that opens,
+// resolve the function pointers stored in resp->ch, and call nvmlInit_v2.
+//
+// On success resp->err is NULL and resp->ch is ready for cuda_check_vram.
+// On any failure resp->err is a strdup'd message the caller must free, the
+// library (if it was opened) is unloaded, and resp->ch.handle is NULL.
+void cuda_init(cuda_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  // The search loop below stops once handle is non-NULL, so start from a
+  // known state rather than relying on the caller to have zeroed resp.
+  resp->ch.handle = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"nvmlInit_v2", (void *)&resp->ch.initFn},
+      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
+  };
+
+  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
+    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->ch.handle) {
+    // TODO improve error message, as the LOAD_ERR will have typically have the
+    // final path that was checked which might be confusing.
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             cuda_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {  // TODO - fix this to use a null terminated list
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    // Test the resolved symbol, not the address of the slot it is stored in
+    // (which is never NULL).
+    if (!*l[i].p) {
+      // Build the message before unloading so LOAD_ERR() still describes the
+      // failed lookup.
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.initFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    // The handle is invalid whenever err is set, so release the library
+    // instead of leaking it.
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    resp->err = strdup(buf);
+  }
+
+  return;
+}
+// Query total and free memory of device index 0 through the NVML function
+// pointers in h. On failure resp->err is a strdup'd message the caller must
+// free and total/free are left untouched.
+void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle isn't initialized");
+    return;
+  }
+
+  // TODO - handle multiple GPUs
+  ret = (*h.getHandle)(0, &device);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device handle: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.getMemInfo)(device, &memInfo);
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+  return;
+}
+#endif  // __APPLE__
\ No newline at end of file
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDA_H__
+#define __GPU_INFO_CUDA_H__
+#include "gpu_info.h"
+// Just enough typedef's to dlopen/dlsym for memory information
+// Status code returned by the NVML function pointers in cuda_handle_t.
+// Callers in this package only compare against NVML_SUCCESS and print any
+// other value numerically.
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+// Output buffer passed to the getMemInfo function pointer; cuda_check_vram
+// reads total and free from it.
+// NOTE(review): field order is presumably meant to mirror NVML's own
+// nvmlMemory_t layout - confirm against the NVML headers.
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+// Library handle plus the NVML entry points that cuda_init resolves from it.
+typedef struct cuda_handle {
+  void *handle;                      // value returned by LOAD_LIBRARY
+  nvmlReturn_t (*initFn)(void);      // resolved from "nvmlInit_v2"
+  nvmlReturn_t (*shutdownFn)(void);  // resolved from "nvmlShutdown"
+  // resolved from "nvmlDeviceGetHandleByIndex"
+  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
+  // resolved from "nvmlDeviceGetMemoryInfo"
+  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
+} cuda_handle_t;
+// Result of cuda_init. err is allocated with strdup, so a non-NULL value
+// must be freed by the caller.
+typedef struct cuda_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cuda_handle_t ch;
+} cuda_init_resp_t;
+void cuda_init(cuda_init_resp_t *resp);
+void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+#endif  // __GPU_INFO_CUDA_H__
+#endif  // __APPLE__
\ No newline at end of file
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
+#ifndef __APPLE__
+#include "gpu_info_rocm.h"
+#include <string.h>
+#ifndef _WIN32
+const char *rocm_lib_paths[] = {
+    "librocm_smi64.so",
+    "/opt/rocm/lib/librocm_smi64.so",
+    NULL,
+};
+#else
+// TODO untested
+const char *rocm_lib_paths[] = {
+    "rocm_smi64.dll",
+    "/opt/rocm/lib/rocm_smi64.dll",
+    NULL,
+};
+#endif
+// Load the ROCm SMI library from the first entry of rocm_lib_paths that
+// opens, resolve the function pointers stored in resp->rh, and call
+// rsmi_init(0).
+//
+// On success resp->err is NULL and resp->rh is ready for rocm_check_vram.
+// On any failure resp->err is a strdup'd message the caller must free, the
+// library (if it was opened) is unloaded, and resp->rh.handle is NULL.
+void rocm_init(rocm_init_resp_t *resp) {
+  rsmi_status_t ret;
+  resp->err = NULL;
+  // The search loop below stops once handle is non-NULL, so start from a
+  // known state rather than relying on the caller to have zeroed resp.
+  resp->rh.handle = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"rsmi_init", (void *)&resp->rh.initFn},
+      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
+      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+  };
+
+  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
+    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->rh.handle) {
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Radeon GPUs: %s\n",
+             rocm_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
+    // Test the resolved symbol, not the address of the slot it is stored in
+    // (which is never NULL).
+    if (!*l[i].p) {
+      // Build the message before unloading so LOAD_ERR() still describes the
+      // failed lookup.
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());
+      UNLOAD_LIBRARY(resp->rh.handle);
+      resp->rh.handle = NULL;
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->rh.initFn)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    // The handle is invalid whenever err is set, so release the library
+    // instead of leaking it.
+    UNLOAD_LIBRARY(resp->rh.handle);
+    resp->rh.handle = NULL;
+    resp->err = strdup(buf);
+  }
+
+  return;
+}
+// Query total and used VRAM of device index 0 through the ROCm SMI function
+// pointers in h and report free memory as total minus used. On failure
+// resp->err is a strdup'd message the caller must free and total/free are
+// left untouched.
+void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  // uint32_t num_devices;
+  // uint16_t device;
+  uint64_t totalMem = 0;
+  uint64_t usedMem = 0;
+  rsmi_status_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  if (h.handle == NULL) {
+    resp->err = strdup("rocm handle isn't initialized");
+    return;
+  }
+
+  // TODO - iterate through devices...  ret =
+  // rsmi_num_monitor_devices(&num_devices);
+
+  // ret = (*h.getHandle)(0, &device);
+  // if (ret != RSMI_STATUS_SUCCESS) {
+  //     printf("rocm vram device lookup failure: %d\n", ret);
+  //     return -1;
+  // }
+
+  // Get total memory - used memory for available memory
+  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->total = totalMem;
+  // Clamp at zero: an unsigned subtraction would wrap to a huge value if the
+  // reported usage ever exceeded the reported total.
+  resp->free = usedMem > totalMem ? 0 : totalMem - usedMem;
+  return;
+}
+#endif  // __APPLE__
\ No newline at end of file
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ROCM_H__
+#define __GPU_INFO_ROCM_H__
+#include "gpu_info.h"
+// Just enough typedef's to dlopen/dlsym for memory information
+// Status code returned by the ROCm SMI function pointers in rocm_handle_t.
+// Callers in this package only compare against RSMI_STATUS_SUCCESS and print
+// any other value numerically.
+typedef enum rsmi_status_return {
+  RSMI_STATUS_SUCCESS = 0,
+  // Other values omitted for now...
+} rsmi_status_t;
+// Memory type selector passed to totalMemFn/usageMemFn; rocm_check_vram only
+// uses RSMI_MEM_TYPE_VRAM.
+// NOTE(review): values are presumably meant to mirror ROCm SMI's own enum -
+// confirm against the rocm_smi headers.
+typedef enum rsmi_memory_type {
+  RSMI_MEM_TYPE_VRAM = 0,
+  RSMI_MEM_TYPE_VIS_VRAM,
+  RSMI_MEM_TYPE_GTT,
+} rsmi_memory_type_t;
+// Library handle plus the ROCm SMI entry points that rocm_init resolves
+// from it.
+typedef struct rocm_handle {
+  void *handle;                         // value returned by LOAD_LIBRARY
+  rsmi_status_t (*initFn)(uint64_t);    // resolved from "rsmi_init"
+  rsmi_status_t (*shutdownFn)(void);    // resolved from "rsmi_shut_down"
+  // resolved from "rsmi_dev_memory_total_get"
+  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  // resolved from "rsmi_dev_memory_usage_get"
+  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+} rocm_handle_t;
+// Result of rocm_init. err is allocated with strdup, so a non-NULL value
+// must be freed by the caller.
+typedef struct rocm_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  rocm_handle_t rh;
+} rocm_init_resp_t;
+void rocm_init(rocm_init_resp_t *resp);
+void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
+#endif  // __GPU_INFO_ROCM_H__
+#endif  // __APPLE__
\ No newline at end of file
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
+package gpu
+import (
+	"runtime"
+	"testing"
+	"github.com/stretchr/testify/assert"
+)
+// TestBasicGetGPUInfo checks that GetGPUInfo reports one of the known driver
+// names and, on linux and windows, non-zero total and free memory.
+func TestBasicGetGPUInfo(t *testing.T) {
+	info := GetGPUInfo()
+	// Match against a list rather than a single string: a substring check
+	// would also accept "" or any fragment such as "CU".
+	assert.Contains(t, []string{"CUDA", "ROCM", "CPU", "METAL"}, info.Driver)
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO - remove this once MacOS returns some size for CPU
+		return
+	case "linux", "windows":
+		assert.Greater(t, info.TotalMemory, uint64(0))
+		assert.Greater(t, info.FreeMemory, uint64(0))
+	default:
+		return
+	}
+}
+// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
+package gpu
+// Beginning of an `ollama info` command
+//
+// GpuInfo is the value returned by GetGPUInfo. The non-darwin GetGPUInfo
+// builds it with a positional literal, so field order matters there.
+type GpuInfo struct {
+	// Driver is "CUDA", "ROCM", "CPU" or "METAL" as set by GetGPUInfo.
+	Driver string `json:"driver,omitempty"`
+	// Library is "cuda_server", "rocm_server" or "default" as set by GetGPUInfo.
+	Library string `json:"library,omitempty"`
+	// TotalMemory and FreeMemory are copied from the C mem_info_t result;
+	// both are zero on darwin and when the memory lookup fails.
+	TotalMemory uint64 `json:"total_memory,omitempty"`
+	FreeMemory  uint64 `json:"free_memory,omitempty"`
+	// TODO add other useful attributes about the card here for discovery information
+}
--- a/llm/dynamic_shim.c
+++ b/llm/dynamic_shim.c
+#include "dynamic_shim.h"
+#include <stdio.h>
+#include <string.h>
+#ifdef __linux__
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#elif _WIN32
+#include <windows.h>
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+// TODO - refactor this with proper error message handling on windows
+// Formats the calling thread's last Windows error code as hex text.
+// Returns a pointer to static storage that is overwritten by the next call.
+inline static char *LOAD_ERR() {
+  // "0x" + up to 8 hex digits of a 32-bit error code + NUL needs 11 bytes;
+  // a smaller buffer silently truncates larger codes.
+  static char errbuf[16];
+  snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
+  return errbuf;
+}
+#else
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#endif
+// Load the server library at libPath and resolve every llama_server_* entry
+// point into the matching function pointer of s.
+//
+// On failure err->id is set to -1 and err->msg (capacity err->msg_len)
+// receives a description; if a symbol is missing the library is unloaded and
+// s->handle is reset to NULL. err is not modified on success.
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
+  int i = 0;
+  // Table of symbol name -> destination slot, terminated by a NULL slot.
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"llama_server_init", (void *)&s->llama_server_init},
+      {"llama_server_start", (void *)&s->llama_server_start},
+      {"llama_server_stop", (void *)&s->llama_server_stop},
+      {"llama_server_completion", (void *)&s->llama_server_completion},
+      {"llama_server_completion_next_result",
+       (void *)&s->llama_server_completion_next_result},
+      {"llama_server_completion_cancel",
+       (void *)&s->llama_server_completion_cancel},
+      {"llama_server_release_task_result",
+       (void *)&s->llama_server_release_task_result},
+      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
+      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
+      {"llama_server_embedding", (void *)&s->llama_server_embedding},
+      {"llama_server_release_json_resp",
+       (void *)&s->llama_server_release_json_resp},
+      {"", NULL},
+  };
+
+  printf("Lazy loading %s library\n", libPath);
+  s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
+  if (!s->handle) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
+    return;
+  }
+
+  for (i = 0; l[i].p != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
+    // Test the resolved symbol, not the address of the slot it is stored in
+    // (which is non-NULL for every entry the loop visits).
+    if (!*l[i].p) {
+      err->id = -1;
+      // Build the message before unloading so LOAD_ERR() still describes the
+      // failed lookup.
+      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
+               l[i].s, LOAD_ERR());
+      UNLOAD_LIBRARY(s->handle);
+      s->handle = NULL;
+      return;
+    }
+  }
+}
+// Forwards sparams and err to the llama_server_init pointer held in s.
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
+  s.llama_server_init(sparams, err);
+}
+// Calls the llama_server_start pointer held in s.
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
+  s.llama_server_start();
+}
+// Calls the llama_server_stop pointer held in s.
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
+  s.llama_server_stop();
+}
+// Forwards json_req and resp to the llama_server_completion pointer held in s.
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
+  s.llama_server_completion(json_req, resp);
+}
+// Forwards task_id and result to the llama_server_completion_next_result
+// pointer held in s.
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result) {
+  s.llama_server_completion_next_result(task_id, result);
+}
+// Forwards task_id and err to the llama_server_completion_cancel pointer
+// held in s.
+inline void dynamic_shim_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
+  s.llama_server_completion_cancel(task_id, err);
+}
+// Forwards result to the llama_server_release_task_result pointer held in s.
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
+  s.llama_server_release_task_result(result);
+}
+// Forwards json_req, json_resp and err to the llama_server_tokenize pointer
+// held in s.
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
+  s.llama_server_tokenize(json_req, json_resp, err);
+}
+// Forwards json_req, json_resp and err to the llama_server_detokenize pointer
+// held in s.
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
+  s.llama_server_detokenize(json_req, json_resp, err);
+}
+// Forwards json_req, json_resp and err to the llama_server_embedding pointer
+// held in s.
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
+  s.llama_server_embedding(json_req, json_resp, err);
+}
+// Forwards json_resp to the llama_server_release_json_resp pointer held in s.
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
+  s.llama_server_release_json_resp(json_resp);
+}