Commit 35934b2e authored by Daniel Hiltgen

Adapted rocm support to cgo based llama.cpp

parent f8ef4439
@@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
 COPY . .
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... \
-    && /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build .
 FROM ubuntu:22.04
 RUN apt-get update && apt-get install -y ca-certificates
@@ -27,5 +27,3 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-
-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-amd64
+# ROCm only supports amd64
+ARG ROCM_VERSION=5.7
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+ENV ROCM_PATH=/opt/rocm
+
+# Ubuntu 22.04 arm64 dependencies
+FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr
 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG CLBLAST_VER=1.6.1
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
+
 # install go
 ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
@@ -26,6 +51,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}
-RUN /usr/local/go/bin/go generate -tags cuda ./... && \
-    /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .
@@ -185,8 +185,6 @@ ollama list
 ## Building

-### Generic (CPU)
-
 Install `cmake` and `go`:

 ```
@@ -202,32 +200,36 @@ Then build the binary:
 go build .
 ```

-### CUDA (NVIDIA)
+### Linux/Windows CUDA (NVIDIA)
 *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages.
+Note: at present, Ollama is optimized for GPU usage on Linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU.
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.

 Then generate dependencies:
 ```
-go generate -tags cuda ./...
+go generate ./...
 ```
 Then build the binary:
 ```
-go build -tags cuda .
+go build .
 ```

-### ROCm (AMD)
+### Linux ROCm (AMD)
 *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

 Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.

 Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
 ```
-CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./...
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
 ```
 Then build the binary:
 ```
-go build -tags rocm
+go build .
 ```
+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

 ### Running local builds
 Next, start the server:
 //go:build linux || windows
-package llm
+package gpu
+
+/*
+#include "gpu_info.h"
+*/
+import "C"
 import (
-	"errors"
+	"fmt"
 	"log"
+	"sync"
+	"unsafe"

 	"github.com/jmorganca/ollama/api"
 )

-/*
-#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/"
-#cgo linux LDFLAGS: -lnvidia-ml
-
-#include <stdlib.h>
-#include "examples/server/server.h"
-*/
-import "C"
+type handles struct {
+	cuda *C.cuda_handle_t
+	rocm *C.rocm_handle_t
+}
+
+var gpuMutex sync.Mutex
+var gpuHandles *handles = nil
+
+// Note: gpuMutex must already be held
+func initGPUHandles() {
+	log.Printf("Detecting GPU type")
+	gpuHandles = &handles{nil, nil}
+	var resp C.cuda_init_resp_t
+	C.cuda_init(&resp)
+	if resp.err != nil {
+		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err))
+
+		var resp C.rocm_init_resp_t
+		C.rocm_init(&resp)
+		if resp.err != nil {
+			log.Printf("ROCm not detected: %s", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			log.Printf("Radeon GPU detected")
+			rocm := resp.rh
+			gpuHandles.rocm = &rocm
+		}
+	} else {
+		log.Printf("Nvidia GPU detected")
+		cuda := resp.ch
+		gpuHandles.cuda = &cuda
+	}
+}
+
+func GetGPUInfo() GpuInfo {
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}
+
+	var memInfo C.mem_info_t
+	var resp GpuInfo
+	if gpuHandles.cuda != nil {
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+		resp.Driver = "CUDA"
+	} else if gpuHandles.rocm != nil {
+		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+		resp.Driver = "ROCM"
+	} else {
+		C.cpu_check_ram(&memInfo)
+		resp.Driver = "CPU"
+	}
+	if memInfo.err != nil {
+		log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err))
+		C.free(unsafe.Pointer(memInfo.err))
+	}
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
+	return resp
+}

-// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
-	return int64(C.check_vram()), nil
+	gpuInfo := GetGPUInfo()
+	if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
+		return int64(gpuInfo.FreeMemory), nil
+	}
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
 }

 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
-	freeBytes, err := CheckVRAM()
-	if err != nil {
-		if !errors.Is(err, errNvidiaSMI) {
-			log.Print(err.Error())
-		}
-		// nvidia driver not installed or no nvidia GPU found
+	info := GetGPUInfo()
+	if info.Driver == "CPU" {
 		return 0
 	}
@@ -41,17 +103,17 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	We can store the model weights and the kv cache in vram,
 	to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file.
 	*/
-	bytesPerLayer := fileSizeBytes / numLayer
+	bytesPerLayer := uint64(fileSizeBytes / numLayer)
 	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
-	layers := int(freeBytes/bytesPerLayer) * 3 / 4
+	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
 	// TODO - not sure on this part... if we can't fit all the layers, just fall back to CPU
 	// if int64(layers) < numLayer {
-	// 	log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
+	// 	log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", info.FreeMemory/(1024*1024), fileSizeBytes/(1024*1024))
 	// 	return 0
 	// }
-	log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer)
+	log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
 	return layers
 }
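To make the 75% heuristic concrete, here is a small, self-contained sketch with hypothetical numbers (the 4 GiB / 32-layer / 6 GiB figures are illustrative, not from the commit):

```go
package main

import "fmt"

func main() {
	fileSizeBytes := int64(4 << 30) // hypothetical 4 GiB model file
	numLayer := int64(32)           // hypothetical layer count from the model file
	freeMemory := uint64(6 << 30)   // hypothetical 6 GiB of free VRAM

	bytesPerLayer := uint64(fileSizeBytes / numLayer) // 128 MiB per layer
	layers := int(freeMemory/bytesPerLayer) * 3 / 4   // 48 layers would fit; 75% of that is 36

	fmt.Printf("loading up to %d GPU layers out of %d\n", layers, numLayer)
}
```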
 //go:build darwin
-package llm
+package gpu
+
+import "C"
 import (
 	"github.com/jmorganca/ollama/api"
 )
@@ -9,11 +10,25 @@ import (
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	// TODO - assume metal, and return free memory?
-	return 0, errNvidiaSMI
+	return 0, nil
 }

+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+	return GpuInfo{
+		Driver:      "METAL",
+		TotalMemory: 0,
+		FreeMemory:  0,
+	}
+}
+
 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	// default to enable metal on macOS
 	return 1
 }
+
+func nativeInit() error {
+	return nil
+}
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mem_info {
uint64_t total;
uint64_t free;
char *err; // If non-null, caller responsible for freeing
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__
\ No newline at end of file
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
resp->err = strdup(LOAD_ERR());
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(cuda_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
if (!resp->ch.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
return;
}
ret = (*h.initFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
ret = (*h.shutdownFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
} cuda_handle_t;
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
void cuda_init(cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
#ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(rocm_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
if (!resp->rh.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
ret = (*h.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
(*h.shutdownFn)();
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
}
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
void rocm_init(rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__
\ No newline at end of file
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver)
switch runtime.GOOS {
case "darwin":
// TODO - remove this once MacOS returns some size for CPU
return
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package gpu
// Beginning of an `ollama info` command
type GpuInfo struct {
Driver string `json:"driver,omitempty"`
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}
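A minimal sketch of how a caller might consume this discovery struct, assuming the package is importable as `github.com/jmorganca/ollama/gpu` (the import path used elsewhere in this commit); the `main` wrapper is illustrative:

```go
package main

import (
	"fmt"

	"github.com/jmorganca/ollama/gpu"
)

func main() {
	// GetGPUInfo falls back to "CPU" (or "METAL" on darwin) when no GPU library loads.
	info := gpu.GetGPUInfo()
	fmt.Printf("driver=%s free=%d MiB total=%d MiB\n",
		info.Driver, info.FreeMemory/(1024*1024), info.TotalMemory/(1024*1024))
}
```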
//go:build cuda
package llm
import (
"bufio"
"bytes"
"errors"
"fmt"
"log"
"os/exec"
"path"
"strconv"
"strings"
"github.com/jmorganca/ollama/format"
)
var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
var freeMiB int64
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "[Insufficient Permissions]") {
return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
}
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
freeMiB += vram
}
freeBytes := freeMiB * 1024 * 1024
if freeBytes < 2*format.GigaByte {
log.Printf("less than 2 GB VRAM available")
return 0, errAvailableVRAM
}
return freeBytes, nil
}
//go:build !rocm && !cuda
package llm
import (
"errors"
)
var (
errNoAccel = errors.New("no accelerator support in this binary")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return make([]ModelRunner, 0, 1)
}
// CheckVRAM is a stub with no accelerator.
func CheckVRAM() (int64, error) {
return 0, errNoGPU
}
//go:build rocm
package llm
import (
"bytes"
"encoding/csv"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
)
var errNoAccel = errors.New("rocm-smi command failed")
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs
func CheckVRAM() (int64, error) {
rocmHome := os.Getenv("ROCM_PATH")
if rocmHome == "" {
rocmHome = os.Getenv("ROCM_HOME")
}
if rocmHome == "" {
log.Println("warning: ROCM_PATH is not set. Trying a likely fallback path, but it is recommended to set this variable in the environment.")
rocmHome = "/opt/rocm"
}
cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
csvData := csv.NewReader(&stdout)
// llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME.
totalBiggestCard := int64(0)
bigCardName := ""
for {
record, err := csvData.Read()
if err == io.EOF {
break
}
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
if !strings.HasPrefix(record[0], "card") {
continue
}
cardTotal, err := strconv.ParseInt(record[1], 10, 64)
if err != nil {
return 0, err
}
cardUsed, err := strconv.ParseInt(record[2], 10, 64)
if err != nil {
return 0, err
}
possible := (cardTotal - cardUsed)
log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0])
if possible > totalBiggestCard {
totalBiggestCard = possible
bigCardName = record[0]
}
}
if totalBiggestCard == 0 {
log.Printf("found ROCm GPU but failed to parse free VRAM!")
return 0, errNoAccel
}
log.Printf("ROCm selecting device %q", bigCardName)
return totalBiggestCard, nil
}
 package llm

 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
@@ -25,6 +25,8 @@ package llm
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
+// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
@@ -35,7 +37,7 @@ package llm
 #cgo windows LDFLAGS: -lext_server_shared -lpthread

 #include <stdlib.h>
-#include "examples/server/server.h"
+#include "server.h"
 */
 import "C"
@@ -43,25 +45,51 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
 	"os"
 	"runtime"
+	"strings"
 	"sync"
 	"time"
 	"unsafe"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
 )

-func errWrap(resp C.ext_server_err) error {
-	if resp.code == 0 {
-		return nil
-	}
-	err := fmt.Errorf(C.GoString(resp.err))
-	C.free(unsafe.Pointer(resp.err))
-	return err
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf(C.GoString(resp.msg))
+}
+
+type extServer interface {
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char)
 }
 type llamaExtServer struct {
@@ -71,21 +99,61 @@ type llamaExtServer struct {
 // Note: current implementation does not support concurrent instantiations
 var mutex sync.Mutex

-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) {
+func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
+	C.llama_server_init(sparams, err)
+}
+func (llm *llamaExtServer) llama_server_start() {
+	C.llama_server_start()
+}
+func (llm *llamaExtServer) llama_server_stop() {
+	C.llama_server_stop()
+}
+func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
+	C.llama_server_completion(json_req, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
+	C.llama_server_completion_next_result(task_id, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
+	C.llama_server_completion_cancel(task_id, err)
+}
+func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
+	C.llama_server_release_task_result(result)
+}
+func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_tokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_detokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_embedding(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
+	C.llama_server_release_json_resp(json_resp)
+}
+
+func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	server := &llamaExtServer{opts}
+	return newExtServer(server, model, adapters, projectors, numLayers, opts)
+}
+
+func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	if !mutex.TryLock() {
 		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
 	}
-	server := &llamaExtServer{opts}
 	fileInfo, err := os.Stat(model)
 	if err != nil {
 		return nil, err
 	}
-	var sparams C.ext_server_params
+	var sparams C.ext_server_params_t
 	sparams.model = C.CString(model)
 	defer C.free(unsafe.Pointer(sparams.model))

-	numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
+	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)

 	sparams.embedding = true
 	sparams.n_ctx = C.uint(opts.NumCtx)
@@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 	// Always use the value encoded in the model
 	sparams.rope_freq_base = 0.0
 	sparams.rope_freq_scale = 0.0
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+	sparams.numa = C.bool(opts.UseNUMA)

 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
-		la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter))
+		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
 		defer C.free(unsafe.Pointer(la))
 		la.adapter = C.CString(adapters[i])
 		defer C.free(unsafe.Pointer(la.adapter))
@@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 		}
 	}

-	// TODO - implement ME
-	// if len(projectors) > 0 {
-	// 	// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-	// 	params = append(params, "--mmproj", projectors[0])
-	// }
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	} else {
+		sparams.mmproj = nil
+	}

 	if opts.NumThread > 0 {
 		sparams.n_threads = C.uint(opts.NumThread)
@@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 		sparams.n_threads = C.uint(runtime.NumCPU())
 	}

-	sparams.memory_f16 = false
-	if opts.F16KV {
-		sparams.memory_f16 = true
-	}
-	sparams.use_mlock = false
-	if opts.UseMLock {
-		sparams.use_mlock = true
-	}
-	sparams.use_mmap = true
-	if !opts.UseMMap {
-		sparams.use_mmap = false
-	}
-	sparams.numa = false
-	if opts.UseNUMA {
-		sparams.numa = true
-	}
-
 	log.Printf("Initializing internal llama server")
-	err = errWrap(C.llama_server_init(&sparams))
-	if err != nil {
-		return nil, err
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	server.llama_server_init(&sparams, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}

 	log.Printf("Starting internal llama main loop")
-	C.llama_server_start()
+	server.llama_server_start()
 	return server, nil
 }
-func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
+	return predict(llm, llm.Options, ctx, pred, fn)
+}
+
+func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var imageData []ImageData
+	if len(predict.Images) > 0 {
+		for cnt, i := range predict.Images {
+			imageData = append(imageData, ImageData{Data: i, ID: cnt})
+		}
+	}
+	log.Printf("loaded %d images", len(imageData))
 	request := map[string]any{
 		"prompt":            predict.Prompt,
 		"stream":            true,
-		"n_predict":         llm.NumPredict,
-		"n_keep":            llm.NumKeep,
-		"temperature":       llm.Temperature,
-		"top_k":             llm.TopK,
-		"top_p":             llm.TopP,
-		"tfs_z":             llm.TFSZ,
-		"typical_p":         llm.TypicalP,
-		"repeat_last_n":     llm.RepeatLastN,
-		"repeat_penalty":    llm.RepeatPenalty,
-		"presence_penalty":  llm.PresencePenalty,
-		"frequency_penalty": llm.FrequencyPenalty,
-		"mirostat":          llm.Mirostat,
-		"mirostat_tau":      llm.MirostatTau,
-		"mirostat_eta":      llm.MirostatEta,
-		"penalize_nl":       llm.PenalizeNewline,
-		"seed":              llm.Seed,
-		"stop":              llm.Stop,
+		"n_predict":         opts.NumPredict,
+		"n_keep":            opts.NumKeep,
+		"temperature":       opts.Temperature,
+		"top_k":             opts.TopK,
+		"top_p":             opts.TopP,
+		"tfs_z":             opts.TFSZ,
+		"typical_p":         opts.TypicalP,
+		"repeat_last_n":     opts.RepeatLastN,
+		"repeat_penalty":    opts.RepeatPenalty,
+		"presence_penalty":  opts.PresencePenalty,
+		"frequency_penalty": opts.FrequencyPenalty,
+		"mirostat":          opts.Mirostat,
+		"mirostat_tau":      opts.MirostatTau,
+		"mirostat_eta":      opts.MirostatEta,
+		"penalize_nl":       opts.PenalizeNewline,
+		"seed":              opts.Seed,
+		"stop":              opts.Stop,
+		"image_data":        imageData,
 	}

 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
 	}

-	// Handling JSON marshaling with special characters unescaped.
-	buffer := &bytes.Buffer{}
-	enc := json.NewEncoder(buffer)
-	enc.SetEscapeHTML(false)
-
-	if err := enc.Encode(request); err != nil {
-		return fmt.Errorf("failed to marshal data: %w", err)
-	}
-
-	req := C.CString(buffer.String())
-	defer C.free(unsafe.Pointer(req))
-
-	cmpCtx := C.llama_server_completion(req)
-	if cmpCtx.task_id < 0 {
-		defer C.free(unsafe.Pointer(cmpCtx.err))
-		return fmt.Errorf(C.GoString(cmpCtx.err))
-	}
-
-	for {
-		select {
-		case <-ctx.Done():
-			// This handles the request cancellation
-			return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
-		default:
-			result := C.llama_server_completion_next_result(cmpCtx.task_id)
-			if result.result_json != nil {
-				defer C.free(unsafe.Pointer(result.result_json))
-			}
-			var p prediction
-			if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
-				err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
-				return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
-			}
-
-			if p.Content != "" {
-				fn(PredictResult{
-					// Model: predict.Model, // XXX remove or replace?
-					CreatedAt: time.Now().UTC(),
-					Content:   p.Content,
-				})
-			}
-
-			if p.Stop {
-				fn(PredictResult{
-					// Model: predict.Model, // XXX remove or replace?
-					CreatedAt:          time.Now().UTC(),
-					TotalDuration:      time.Since(predict.CheckpointStart),
-					Done:               true,
-					PromptEvalCount:    p.Timings.PromptN,
-					PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
-					EvalCount:          p.Timings.PredictedN,
-					EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
-				})
-				return nil
-			}
-		}
-	}
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %w", err)
+		}
+
+		req := C.CString(buffer.String())
+		defer C.free(unsafe.Pointer(req))
+
+		llm.llama_server_completion(req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		retryNeeded := false
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				llm.llama_server_completion_cancel(resp.id, &resp)
+				if resp.id < 0 {
+					return extServerResponseToErr(resp)
+				} else {
+					return nil
+				}
+			default:
+				var result C.ext_server_task_result_t
+				llm.llama_server_completion_next_result(resp.id, &result)
+				json_resp := C.GoString(result.json_resp)
+				llm.llama_server_release_task_result(&result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					llm.llama_server_completion_cancel(resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					} else {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+					}
+				}
+
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					retryNeeded = true
+					// task will already be canceled
+					break out
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
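The new predict loop above retries the transient "slot unavailable" case with bounded exponential backoff. A minimal standalone sketch of the same pattern (`doWork` and the retry bound here are illustrative, not from the commit):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// doWork is a stand-in for an operation that can fail transiently.
func doWork(attempt int) error {
	if attempt < 2 {
		return errors.New("slot unavailable") // simulated transient failure
	}
	return nil
}

func main() {
	const maxRetries = 5
	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}
		if err := doWork(retries); err == nil {
			fmt.Println("succeeded after", retries, "retries")
			return
		}
	}
	fmt.Println("max retries exceeded")
}
```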
 func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return encode(llm, ctx, prompt)
+}
+
+func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
 		return nil, fmt.Errorf("marshaling encode data: %w", err)
 	}
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))
-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_tokenize(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_tokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var encoded TokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil {
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
 		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
 	}
@@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er
 }

 func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	return decode(llm, ctx, tokens)
+}
+
+func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
 	if len(tokens) == 0 {
 		return "", nil
 	}
@@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))

-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_detokenize(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_detokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var decoded DetokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil {
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
 		return "", fmt.Errorf("unmarshal encode response: %w", err2)
 	}
@@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
 }

 func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return embedding(llm, ctx, input)
+}
+
+func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: input})
 	if err != nil {
 		return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))

-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_embedding(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
-	}
-	if err != nil {
-		return nil, err
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_embedding(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var embedding EmbeddingResponse
-	if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil {
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

 	return embedding.Embedding, nil
 }

-func (llm *llamaExtServer) Ping(ctx context.Context) error {
-	// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state
-	return nil
+func (llm *llamaExtServer) Close() {
+	close(llm)
 }

-func (llm *llamaExtServer) Close() {
-	C.llama_server_stop()
+func close(llm extServer) {
+	llm.llama_server_stop()
 	mutex.Unlock()
 }
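The `extServer` interface introduced above exists so the shared request/response logic (`newExtServer`, `predict`, `encode`, `decode`, `embedding`) is written once against the interface while each accelerator build supplies thin cgo wrappers. A toy illustration of that pattern, with hypothetical types that are not part of the commit:

```go
package main

import "fmt"

// server is a toy version of the extServer interface: one Go-level
// contract, multiple thin wrappers around different native entry points.
type server interface {
	start()
}

type cudaServer struct{}

func (cudaServer) start() { fmt.Println("starting CUDA-linked server") }

type rocmServer struct{}

func (rocmServer) start() { fmt.Println("starting ROCm-linked server") }

// run holds the shared logic, written once against the interface.
func run(s server) { s.start() }

func main() {
	run(cudaServer{})
	run(rocmServer{})
}
```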
 # common logic across linux and darwin
 init_vars() {
+    LLAMACPP_DIR=gguf
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server"
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
     if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
@@ -29,6 +30,6 @@ apply_patches() {
 }

 build() {
-    cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
 }
\ No newline at end of file
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp
@@ -30,6 +30,7 @@ git_module_setup
 apply_patches
 build

+# TODO - improve this to handle test cases that need it to be in "." around the tree
 # Enable local debug/run use case
 if [ -e "gguf/ggml-metal.metal" ]; then
     cp gguf/ggml-metal.metal ../../
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp

 set -ex
 set -o pipefail

-# TODO - stopped here - map the variables from above over and refine the case statement below
 echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/cuda"
 git_module_setup
 apply_patches
+CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cuda"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ../../dist/
 build
+
+# TODO - explore mechanism to soften the hard cuda dependency on linux
+# by conditionally building some archive here that aggregates the cuda libs if present
+# so that the cgo flags link this intermediate archive instead of the underlying cuda libs
+#
+# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \
+#     -Wl,--whole-archive \
+#     ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \
+#     ${BUILD_DIR}/common/libcommon.a \
+#     ${BUILD_DIR}/libllama.a \
+#     ${BUILD_DIR}/examples/llava/libllava_static.a \
+#     -Wl,--no-whole-archive \
+#     -lrt -lpthread -ldl -lstdc++ -lm \
+#     /usr/local/cuda/lib64/libcudart_static.a \
+#     /usr/local/cuda/lib64/libcublas_static.a \
+#     /usr/local/cuda/lib64/libcublasLt_static.a \
+#     /usr/local/cuda/lib64/libcudadevrt.a \
+#     /usr/local/cuda/lib64/libculibos.a
+
+if [ -z "${ROCM_PATH}" ] ; then
+    # Try the default location in case it exists
+    ROCM_PATH=/opt/rocm
+fi
+
+if [ -z "${CLBlast_DIR}" ] ; then
+    # Try the default location in case it exists
+    if [ -d /usr/lib/cmake/CLBlast ]; then
+        export CLBlast_DIR=/usr/lib/cmake/CLBlast
+    fi
+fi
+
+BUILD_DIR="gguf/build/rocm"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ${LIB_DIR}
+# Ensure we have at least one file present for the embed
+touch ${LIB_DIR}/.generated
+
+if [ -d "${ROCM_PATH}" ] ; then
+    echo "Building ROCm"
+    init_vars
+    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
+    CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    build
+    gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        -lrt -lpthread -ldl -lstdc++ -lm \
+        -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
+fi