Commit 35934b2e authored by Daniel Hiltgen

Adapted rocm support to cgo based llama.cpp

parent f8ef4439
@@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
 COPY . .
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
-RUN /usr/local/go/bin/go generate -tags cuda ./... \
-    && /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... \
+    && /usr/local/go/bin/go build .
 FROM ubuntu:22.04
 RUN apt-get update && apt-get install -y ca-certificates
@@ -27,5 +27,3 @@ ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENTRYPOINT ["/bin/ollama"]
 CMD ["serve"]
-# centos7 amd64 dependencies
-FROM --platform=linux/amd64 nvidia/cuda:11.3.1-devel-centos7 AS base-amd64
-RUN yum install -y https://repo.ius.io/ius-release-el7.rpm centos-release-scl && \
-    yum update -y && \
-    yum install -y devtoolset-10-gcc devtoolset-10-gcc-c++ git236 wget
-RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh" -O cmake-installer.sh && chmod +x cmake-installer.sh && ./cmake-installer.sh --skip-license --prefix=/usr/local
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-
-# centos8 arm64 dependencies
-FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64
-RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum install -y git cmake
+# Ubuntu 20.04 amd64 dependencies
+FROM --platform=linux/amd64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-amd64
+# ROCm only supports amd64
+ARG ROCM_VERSION=5.7
+# Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr && \
+    mkdir --parents --mode=0755 /etc/apt/keyrings && \
+    wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor > /etc/apt/keyrings/rocm.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \
+    echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev
+ENV ROCM_PATH=/opt/rocm
+
+# Ubuntu 22.04 arm64 dependencies
+FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64
+RUN apt-get update && \
+    apt-get install -y wget && \
+    wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr
 FROM base-${TARGETARCH}
 ARG TARGETARCH
 ARG GOFLAGS="'-ldflags -w -s'"
+ARG CGO_CFLAGS
+ARG CLBLAST_VER=1.6.1
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11
+
+# CLBlast
+RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \
+    cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install
+
 # install go
 ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
@@ -26,6 +51,7 @@ COPY . .
 ENV GOOS=linux
 ENV GOARCH=$TARGETARCH
 ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}
-RUN /usr/local/go/bin/go generate -tags cuda ./... && \
-    /usr/local/go/bin/go build -tags cuda .
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .
@@ -185,8 +185,6 @@ ollama list
 ## Building

-### Generic (CPU)
-
 Install `cmake` and `go`:

 ```
@@ -202,32 +200,36 @@ Then build the binary:
 go build .
 ```

-### CUDA (NVIDIA)
+### Linux/Windows CUDA (NVIDIA)
 *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages.
+Note: at present, Ollama is optimized for GPU usage on Linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU.
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.

 Then generate dependencies:
 ```
-go generate -tags cuda ./...
+go generate ./...
 ```
 Then build the binary:
 ```
-go build -tags cuda .
+go build .
 ```

-### ROCm (AMD)
+### Linux ROCm (AMD)
 *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*

 Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`.

 Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies:
 ```
-CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./...
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
 ```
 Then build the binary:
 ```
-go build -tags rocm
+go build .
 ```
+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.

 ### Running local builds
 Next, start the server:
 //go:build linux || windows
-package llm
+package gpu
+
+/*
+#include "gpu_info.h"
+*/
+import "C"
 import (
-	"errors"
+	"fmt"
 	"log"
+	"sync"
+	"unsafe"

 	"github.com/jmorganca/ollama/api"
 )

-/*
-#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/"
-#cgo linux LDFLAGS: -lnvidia-ml
-
-#include <stdlib.h>
-#include "examples/server/server.h"
-*/
-import "C"
+type handles struct {
+	cuda *C.cuda_handle_t
+	rocm *C.rocm_handle_t
+}
+
+var gpuMutex sync.Mutex
+var gpuHandles *handles = nil
+
+// Note: gpuMutex must already be held
+func initGPUHandles() {
+	log.Printf("Detecting GPU type")
+	gpuHandles = &handles{nil, nil}
+	var resp C.cuda_init_resp_t
+	C.cuda_init(&resp)
+	if resp.err != nil {
+		log.Printf("CUDA not detected: %s", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err))
+
+		var resp C.rocm_init_resp_t
+		C.rocm_init(&resp)
+		if resp.err != nil {
+			log.Printf("ROCm not detected: %s", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			log.Printf("Radeon GPU detected")
+			rocm := resp.rh
+			gpuHandles.rocm = &rocm
+		}
+	} else {
+		log.Printf("Nvidia GPU detected")
+		cuda := resp.ch
+		gpuHandles.cuda = &cuda
+	}
+}
+
+func GetGPUInfo() GpuInfo {
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}
+
+	var memInfo C.mem_info_t
+	var resp GpuInfo
+	if gpuHandles.cuda != nil {
+		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+		resp.Driver = "CUDA"
+	} else if gpuHandles.rocm != nil {
+		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+		resp.Driver = "ROCM"
+	} else {
+		C.cpu_check_ram(&memInfo)
+		resp.Driver = "CPU"
+	}
+	if memInfo.err != nil {
+		log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err))
+		C.free(unsafe.Pointer(memInfo.err))
+	}
+	resp.FreeMemory = uint64(memInfo.free)
+	resp.TotalMemory = uint64(memInfo.total)
+	return resp
+}

-// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
-	return int64(C.check_vram()), nil
+	gpuInfo := GetGPUInfo()
+	if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" {
+		return int64(gpuInfo.FreeMemory), nil
+	}
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
 }

 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
-	freeBytes, err := CheckVRAM()
-	if err != nil {
-		if !errors.Is(err, errNvidiaSMI) {
-			log.Print(err.Error())
-		}
-		// nvidia driver not installed or no nvidia GPU found
+	info := GetGPUInfo()
+	if info.Driver == "CPU" {
 		return 0
 	}
@@ -41,17 +103,17 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	We can store the model weights and the kv cache in vram,
 	to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file.
 	*/
-	bytesPerLayer := fileSizeBytes / numLayer
+	bytesPerLayer := uint64(fileSizeBytes / numLayer)
 	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
-	layers := int(freeBytes/bytesPerLayer) * 3 / 4
+	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
 	// TODO - not sure on this part... if we can't fit all the layers, just fall back to CPU
 	// if int64(layers) < numLayer {
-	// 	log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
+	// 	log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", info.FreeMemory/(1024*1024), fileSizeBytes/(1024*1024))
 	// 	return 0
 	// }
-	log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer)
+	log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
 	return layers
 }
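To make the 75% heuristic concrete, here is a small, self-contained sketch with hypothetical numbers (the 4 GiB / 32-layer / 6 GiB figures are illustrative, not from the commit):

```go
package main

import "fmt"

func main() {
	fileSizeBytes := int64(4 << 30) // hypothetical 4 GiB model file
	numLayer := int64(32)           // hypothetical layer count from the model file
	freeMemory := uint64(6 << 30)   // hypothetical 6 GiB of free VRAM

	bytesPerLayer := uint64(fileSizeBytes / numLayer) // 128 MiB per layer
	layers := int(freeMemory/bytesPerLayer) * 3 / 4   // 48 layers would fit; 75% of that is 36

	fmt.Printf("loading up to %d GPU layers out of %d\n", layers, numLayer)
}
```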
 //go:build darwin
-package llm
+package gpu
+
+import "C"
 import (
 	"github.com/jmorganca/ollama/api"
 )
@@ -9,11 +10,25 @@ import (
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
 	// TODO - assume metal, and return free memory?
-	return 0, errNvidiaSMI
+	return 0, nil
 }

+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+	return GpuInfo{
+		Driver:      "METAL",
+		TotalMemory: 0,
+		FreeMemory:  0,
+	}
+}
+
 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	// default to enable metal on macOS
 	return 1
 }
+
+func nativeInit() error {
+	return nil
+}
#ifndef __APPLE__
#ifndef __GPU_INFO_H__
#define __GPU_INFO_H__
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[8];
snprintf(errbuf, 8, "0x%lx", GetLastError());
return errbuf;
}
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct mem_info {
uint64_t total;
uint64_t free;
char *err; // If non-null, caller responsible for freeing
} mem_info_t;
void cpu_check_ram(mem_info_t *resp);
#ifdef __cplusplus
}
#endif
#include "gpu_info_cuda.h"
#include "gpu_info_rocm.h"
#endif // __GPU_INFO_H__
#endif // __APPLE__
\ No newline at end of file
#include "gpu_info.h"
// Fallbacks for CPU mode
#ifdef _WIN32
#include <sysinfoapi.h>
void cpu_check_ram(mem_info_t *resp) {
resp->err = NULL;
MEMORYSTATUSEX info;
if (GlobalMemoryStatusEx(&info) != 0) {
resp->total = info.ullTotalPhys;
resp->free = info.ullAvailPhys;
} else {
resp->err = strdup(LOAD_ERR());
}
return;
}
#elif __linux__
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
void cpu_check_ram(mem_info_t *resp) {
struct sysinfo info;
resp->err = NULL;
if (sysinfo(&info) != 0) {
resp->err = strdup(strerror(errno));
} else {
resp->total = info.totalram * info.mem_unit;
resp->free = info.freeram * info.mem_unit;
}
return;
}
#elif __APPLE__
// TODO consider an Apple implementation that does something useful
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else
#error "Unsupported platform"
#endif
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
void cuda_init(cuda_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
if (!resp->ch.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->err = NULL;
nvmlDevice_t device;
nvmlMemory_t memInfo = {0};
nvmlReturn_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
if (h.handle == NULL) {
resp->err = strdup("nvml handle sn't initialized");
return;
}
ret = (*h.initFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - handle multiple GPUs
ret = (*h.getHandle)(0, &device);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "unable to get device handle: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
resp->total = memInfo.total;
resp->free = memInfo.free;
ret = (*h.shutdownFn)();
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
resp->err = strdup(buf);
}
return;
}
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#ifndef __GPU_INFO_CUDA_H__
#define __GPU_INFO_CUDA_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum nvmlReturn_enum {
NVML_SUCCESS = 0,
// Other values omitted for now...
} nvmlReturn_t;
typedef void *nvmlDevice_t; // Opaque is sufficient
typedef struct nvmlMemory_st {
unsigned long long total;
unsigned long long free;
unsigned long long used;
} nvmlMemory_t;
typedef struct cuda_handle {
void *handle;
nvmlReturn_t (*initFn)(void);
nvmlReturn_t (*shutdownFn)(void);
nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
} cuda_handle_t;
typedef struct cuda_init_resp {
char *err; // If err is non-null handle is invalid
cuda_handle_t ch;
} cuda_init_resp_t;
void cuda_init(cuda_init_resp_t *resp);
void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
#endif // __GPU_INFO_CUDA_H__
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#include "gpu_info_rocm.h"
#include <string.h>
#ifndef _WIN32
const char *rocm_lib_paths[] = {
"librocm_smi64.so",
"/opt/rocm/lib/librocm_smi64.so",
NULL,
};
#else
// TODO untested
const char *rocm_lib_paths[] = {
"rocm_smi64.dll",
"/opt/rocm/lib/rocm_smi64.dll",
NULL,
};
#endif
void rocm_init(rocm_init_resp_t *resp) {
resp->err = NULL;
const int buflen = 256;
char buf[buflen + 1];
int i;
struct lookup {
char *s;
void **p;
} l[4] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
}
if (!resp->rh.handle) {
snprintf(buf, buflen,
"Unable to load %s library to query for Radeon GPUs: %s\n",
rocm_lib_paths[0], LOAD_ERR());
resp->err = strdup(buf);
return;
}
for (i = 0; i < 4; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
LOAD_ERR());
resp->err = strdup(buf);
return;
}
}
return;
}
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL;
// uint32_t num_devices;
// uint16_t device;
uint64_t totalMem = 0;
uint64_t usedMem = 0;
rsmi_status_t ret;
const int buflen = 256;
char buf[buflen + 1];
int i;
ret = (*h.initFn)(0);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "rocm vram init failure: %d", ret);
resp->err = strdup(buf);
return;
}
// TODO - iterate through devices... ret =
// rsmi_num_monitor_devices(&num_devices);
// ret = (*h.getHandle)(0, &device);
// if (ret != RSMI_STATUS_SUCCESS) {
// printf("rocm vram device lookup failure: %d\n", ret);
// return -1;
// }
// Get total memory - used memory for available memory
ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
if (ret != RSMI_STATUS_SUCCESS) {
(*h.shutdownFn)();
snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
resp->err = strdup(buf);
return;
}
(*h.shutdownFn)();
resp->total = totalMem;
resp->free = totalMem - usedMem;
return;
}
#endif // __APPLE__
\ No newline at end of file
#ifndef __APPLE__
#ifndef __GPU_INFO_ROCM_H__
#define __GPU_INFO_ROCM_H__
#include "gpu_info.h"
// Just enough typedef's to dlopen/dlsym for memory information
typedef enum rsmi_status_return {
RSMI_STATUS_SUCCESS = 0,
// Other values omitted for now...
} rsmi_status_t;
typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_VRAM = 0,
RSMI_MEM_TYPE_VIS_VRAM,
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
typedef struct rocm_init_resp {
char *err; // If err is non-null handle is invalid
rocm_handle_t rh;
} rocm_init_resp_t;
void rocm_init(rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__
\ No newline at end of file
package gpu
import (
"runtime"
"testing"
"github.com/stretchr/testify/assert"
)
func TestBasicGetGPUInfo(t *testing.T) {
info := GetGPUInfo()
assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver)
switch runtime.GOOS {
case "darwin":
// TODO - remove this once MacOS returns some size for CPU
return
case "linux", "windows":
assert.Greater(t, info.TotalMemory, uint64(0))
assert.Greater(t, info.FreeMemory, uint64(0))
default:
return
}
}
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
package gpu
// Beginning of an `ollama info` command
type GpuInfo struct {
Driver string `json:"driver,omitempty"`
TotalMemory uint64 `json:"total_memory,omitempty"`
FreeMemory uint64 `json:"free_memory,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}
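A minimal sketch of how a caller might consume this discovery struct, assuming the package is importable as `github.com/jmorganca/ollama/gpu` (the import path used elsewhere in this commit); the `main` wrapper is illustrative:

```go
package main

import (
	"fmt"

	"github.com/jmorganca/ollama/gpu"
)

func main() {
	// GetGPUInfo falls back to "CPU" (or "METAL" on darwin) when no GPU library loads.
	info := gpu.GetGPUInfo()
	fmt.Printf("driver=%s free=%d MiB total=%d MiB\n",
		info.Driver, info.FreeMemory/(1024*1024), info.TotalMemory/(1024*1024))
}
```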
//go:build cuda
package llm
import (
"bufio"
"bytes"
"errors"
"fmt"
"log"
"os/exec"
"path"
"strconv"
"strings"
"github.com/jmorganca/ollama/format"
)
var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
func CheckVRAM() (int64, error) {
cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
var freeMiB int64
scanner := bufio.NewScanner(&stdout)
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "[Insufficient Permissions]") {
return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi")
}
vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
freeMiB += vram
}
freeBytes := freeMiB * 1024 * 1024
if freeBytes < 2*format.GigaByte {
log.Printf("less than 2 GB VRAM available")
return 0, errAvailableVRAM
}
return freeBytes, nil
}
//go:build !rocm && !cuda
package llm
import (
"errors"
)
var (
errNoAccel = errors.New("no accelerator support in this binary")
)
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return make([]ModelRunner, 0, 1)
}
// CheckVRAM is a stub with no accelerator.
func CheckVRAM() (int64, error) {
return 0, errNoGPU
}
//go:build rocm
package llm
import (
"bytes"
"encoding/csv"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
)
var errNoAccel = errors.New("rocm-smi command failed")
// acceleratedRunner returns the runner for this accelerator given the provided buildPath string.
func acceleratedRunner(buildPath string) []ModelRunner {
return []ModelRunner{
ModelRunner{
Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"),
Accelerated: true,
},
}
}
// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs
func CheckVRAM() (int64, error) {
rocmHome := os.Getenv("ROCM_PATH")
if rocmHome == "" {
rocmHome = os.Getenv("ROCM_HOME")
}
if rocmHome == "" {
log.Println("warning: ROCM_PATH is not set. Trying a likely fallback path, but it is recommended to set this variable in the environment.")
rocmHome = "/opt/rocm"
}
cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv")
var stdout bytes.Buffer
cmd.Stdout = &stdout
err := cmd.Run()
if err != nil {
return 0, errNoAccel
}
csvData := csv.NewReader(&stdout)
// llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME.
totalBiggestCard := int64(0)
bigCardName := ""
for {
record, err := csvData.Read()
if err == io.EOF {
break
}
if err != nil {
return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
}
if !strings.HasPrefix(record[0], "card") {
continue
}
cardTotal, err := strconv.ParseInt(record[1], 10, 64)
if err != nil {
return 0, err
}
cardUsed, err := strconv.ParseInt(record[2], 10, 64)
if err != nil {
return 0, err
}
possible := (cardTotal - cardUsed)
log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0])
if possible > totalBiggestCard {
totalBiggestCard = possible
bigCardName = record[0]
}
}
if totalBiggestCard == 0 {
log.Printf("found ROCm GPU but failed to parse free VRAM!")
return 0, errNoAccel
}
log.Printf("ROCm selecting device %q", bigCardName)
return totalBiggestCard, nil
}
 package llm

 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
+#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
@@ -25,6 +25,8 @@ package llm
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
 #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
+// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
 #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
@@ -35,7 +37,7 @@ package llm
 #cgo windows LDFLAGS: -lext_server_shared -lpthread

 #include <stdlib.h>
-#include "examples/server/server.h"
+#include "server.h"
 */
 import "C"
@@ -43,25 +45,51 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"log"
 	"os"
 	"runtime"
+	"strings"
 	"sync"
 	"time"
 	"unsafe"

 	"github.com/jmorganca/ollama/api"
+	"github.com/jmorganca/ollama/gpu"
 )

-func errWrap(resp C.ext_server_err) error {
-	if resp.code == 0 {
-		return nil
-	}
-	err := fmt.Errorf(C.GoString(resp.err))
-	C.free(unsafe.Pointer(resp.err))
-	return err
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+	var resp C.ext_server_resp_t
+	resp.msg_len = len
+	bytes := make([]byte, len)
+	resp.msg = (*C.char)(C.CBytes(bytes))
+	return resp
+}
+
+func freeExtServerResp(resp C.ext_server_resp_t) {
+	if resp.msg_len == 0 {
+		return
+	}
+	C.free(unsafe.Pointer(resp.msg))
+}
+
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+	return fmt.Errorf(C.GoString(resp.msg))
+}
+
+type extServer interface {
+	LLM
+	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
+	llama_server_start()
+	llama_server_stop()
+	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
+	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
+	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
+	llama_server_release_task_result(result *C.ext_server_task_result_t)
+	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
+	llama_server_release_json_resp(json_resp **C.char)
 }
 type llamaExtServer struct {
@@ -71,21 +99,61 @@ type llamaExtServer struct {
 // Note: current implementation does not support concurrent instantiations
 var mutex sync.Mutex

-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) {
+func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
+	C.llama_server_init(sparams, err)
+}
+func (llm *llamaExtServer) llama_server_start() {
+	C.llama_server_start()
+}
+func (llm *llamaExtServer) llama_server_stop() {
+	C.llama_server_stop()
+}
+func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
+	C.llama_server_completion(json_req, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
+	C.llama_server_completion_next_result(task_id, resp)
+}
+func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
+	C.llama_server_completion_cancel(task_id, err)
+}
+func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
+	C.llama_server_release_task_result(result)
+}
+func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_tokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_detokenize(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
+	C.llama_server_embedding(json_req, json_resp, err)
+}
+func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
+	C.llama_server_release_json_resp(json_resp)
+}
+
+func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	server := &llamaExtServer{opts}
+	return newExtServer(server, model, adapters, projectors, numLayers, opts)
+}
+
+func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	if !mutex.TryLock() {
 		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
 	}
-	server := &llamaExtServer{opts}
 	fileInfo, err := os.Stat(model)
 	if err != nil {
 		return nil, err
 	}
-	var sparams C.ext_server_params
+	var sparams C.ext_server_params_t
 	sparams.model = C.CString(model)
 	defer C.free(unsafe.Pointer(sparams.model))

-	numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
+	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)

 	sparams.embedding = true
 	sparams.n_ctx = C.uint(opts.NumCtx)
@@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 	// Always use the value encoded in the model
 	sparams.rope_freq_base = 0.0
 	sparams.rope_freq_scale = 0.0
+	sparams.memory_f16 = C.bool(opts.F16KV)
+	sparams.use_mlock = C.bool(opts.UseMLock)
+	sparams.use_mmap = C.bool(opts.UseMMap)
+	sparams.numa = C.bool(opts.UseNUMA)

 	sparams.lora_adapters = nil
 	for i := 0; i < len(adapters); i++ {
-		la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter))
+		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
 		defer C.free(unsafe.Pointer(la))
 		la.adapter = C.CString(adapters[i])
 		defer C.free(unsafe.Pointer(la.adapter))
@@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 		}
 	}

-	// TODO - implement ME
-	// if len(projectors) > 0 {
-	// 	// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-	// 	params = append(params, "--mmproj", projectors[0])
-	// }
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		sparams.mmproj = C.CString(projectors[0])
+		defer C.free(unsafe.Pointer(sparams.mmproj))
+	} else {
+		sparams.mmproj = nil
+	}

 	if opts.NumThread > 0 {
 		sparams.n_threads = C.uint(opts.NumThread)
@@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in
 		sparams.n_threads = C.uint(runtime.NumCPU())
 	}

-	sparams.memory_f16 = false
-	if opts.F16KV {
-		sparams.memory_f16 = true
-	}
-	sparams.use_mlock = false
-	if opts.UseMLock {
-		sparams.use_mlock = true
-	}
-	sparams.use_mmap = true
-	if !opts.UseMMap {
-		sparams.use_mmap = false
-	}
-	sparams.numa = false
-	if opts.UseNUMA {
-		sparams.numa = true
-	}
-
 	log.Printf("Initializing internal llama server")
-	err = errWrap(C.llama_server_init(&sparams))
-	if err != nil {
-		return nil, err
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	server.llama_server_init(&sparams, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}

 	log.Printf("Starting internal llama main loop")
-	C.llama_server_start()
+	server.llama_server_start()
 	return server, nil
 }
-func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
+	return predict(llm, llm.Options, ctx, pred, fn)
+}
+
+func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var imageData []ImageData
+	if len(predict.Images) > 0 {
+		for cnt, i := range predict.Images {
+			imageData = append(imageData, ImageData{Data: i, ID: cnt})
+		}
+	}
+	log.Printf("loaded %d images", len(imageData))
 	request := map[string]any{
 		"prompt":            predict.Prompt,
 		"stream":            true,
-		"n_predict":         llm.NumPredict,
-		"n_keep":            llm.NumKeep,
-		"temperature":       llm.Temperature,
-		"top_k":             llm.TopK,
-		"top_p":             llm.TopP,
-		"tfs_z":             llm.TFSZ,
-		"typical_p":         llm.TypicalP,
-		"repeat_last_n":     llm.RepeatLastN,
-		"repeat_penalty":    llm.RepeatPenalty,
-		"presence_penalty":  llm.PresencePenalty,
-		"frequency_penalty": llm.FrequencyPenalty,
-		"mirostat":          llm.Mirostat,
-		"mirostat_tau":      llm.MirostatTau,
-		"mirostat_eta":      llm.MirostatEta,
-		"penalize_nl":       llm.PenalizeNewline,
-		"seed":              llm.Seed,
-		"stop":              llm.Stop,
+		"n_predict":         opts.NumPredict,
+		"n_keep":            opts.NumKeep,
+		"temperature":       opts.Temperature,
+		"top_k":             opts.TopK,
+		"top_p":             opts.TopP,
+		"tfs_z":             opts.TFSZ,
+		"typical_p":         opts.TypicalP,
+		"repeat_last_n":     opts.RepeatLastN,
+		"repeat_penalty":    opts.RepeatPenalty,
+		"presence_penalty":  opts.PresencePenalty,
+		"frequency_penalty": opts.FrequencyPenalty,
+		"mirostat":          opts.Mirostat,
+		"mirostat_tau":      opts.MirostatTau,
+		"mirostat_eta":      opts.MirostatEta,
+		"penalize_nl":       opts.PenalizeNewline,
+		"seed":              opts.Seed,
+		"stop":              opts.Stop,
+		"image_data":        imageData,
 	}

 	if predict.Format == "json" {
 		request["grammar"] = jsonGrammar
 	}

-	// Handling JSON marshaling with special characters unescaped.
-	buffer := &bytes.Buffer{}
-	enc := json.NewEncoder(buffer)
-	enc.SetEscapeHTML(false)
-
-	if err := enc.Encode(request); err != nil {
-		return fmt.Errorf("failed to marshal data: %w", err)
-	}
-
-	req := C.CString(buffer.String())
-	defer C.free(unsafe.Pointer(req))
-
-	cmpCtx := C.llama_server_completion(req)
-	if cmpCtx.task_id < 0 {
-		defer C.free(unsafe.Pointer(cmpCtx.err))
-		return fmt.Errorf(C.GoString(cmpCtx.err))
-	}
-
-	for {
-		select {
-		case <-ctx.Done():
-			// This handles the request cancellation
-			return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
-		default:
-			result := C.llama_server_completion_next_result(cmpCtx.task_id)
-			if result.result_json != nil {
-				defer C.free(unsafe.Pointer(result.result_json))
-			}
-			var p prediction
-			if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
-				err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
-				return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
-			}
-
-			if p.Content != "" {
-				fn(PredictResult{
-					// Model: predict.Model, // XXX remove or replace?
-					CreatedAt: time.Now().UTC(),
-					Content:   p.Content,
-				})
-			}
-
-			if p.Stop {
-				fn(PredictResult{
-					// Model: predict.Model, // XXX remove or replace?
-					CreatedAt:          time.Now().UTC(),
-					TotalDuration:      time.Since(predict.CheckpointStart),
-					Done:               true,
-					PromptEvalCount:    p.Timings.PromptN,
-					PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
-					EvalCount:          p.Timings.PredictedN,
-					EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
-				})
-				return nil
-			}
-		}
-	}
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %w", err)
+		}
+
+		req := C.CString(buffer.String())
+		defer C.free(unsafe.Pointer(req))
+
+		llm.llama_server_completion(req, &resp)
+		if resp.id < 0 {
+			return extServerResponseToErr(resp)
+		}
+
+		retryNeeded := false
+	out:
+		for {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				llm.llama_server_completion_cancel(resp.id, &resp)
+				if resp.id < 0 {
+					return extServerResponseToErr(resp)
+				} else {
+					return nil
+				}
+			default:
+				var result C.ext_server_task_result_t
+				llm.llama_server_completion_next_result(resp.id, &result)
+				json_resp := C.GoString(result.json_resp)
+				llm.llama_server_release_task_result(&result)
+
+				var p prediction
+				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
+					llm.llama_server_completion_cancel(resp.id, &resp)
+					if resp.id < 0 {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
+					} else {
+						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
+					}
+				}
+
+				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
+					retryNeeded = true
+					// task will already be canceled
+					break out
+				}
+
+				if p.Content != "" {
+					fn(PredictResult{
+						Content: p.Content,
+					})
+				}
+
+				if p.Stop {
+					fn(PredictResult{
+						Done:               true,
+						PromptEvalCount:    p.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
+						EvalCount:          p.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// should never reach here ideally
+	return fmt.Errorf("max retries exceeded")
+}
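The new predict loop above retries the transient "slot unavailable" case with bounded exponential backoff. A minimal standalone sketch of the same pattern (`doWork` and the retry bound here are illustrative, not from the commit):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// doWork is a stand-in for an operation that can fail transiently.
func doWork(attempt int) error {
	if attempt < 2 {
		return errors.New("slot unavailable") // simulated transient failure
	}
	return nil
}

func main() {
	const maxRetries = 5
	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}
		if err := doWork(retries); err == nil {
			fmt.Println("succeeded after", retries, "retries")
			return
		}
	}
	fmt.Println("max retries exceeded")
}
```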
 func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
+	return encode(llm, ctx, prompt)
+}
+
+func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: prompt})
 	if err != nil {
 		return nil, fmt.Errorf("marshaling encode data: %w", err)
 	}
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))
-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_tokenize(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_tokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var encoded TokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil {
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
 		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
 	}
@@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er
 }

 func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
+	return decode(llm, ctx, tokens)
+}
+
+func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
 	if len(tokens) == 0 {
 		return "", nil
 	}
@@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))

-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_detokenize(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_detokenize(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return "", extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var decoded DetokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil {
+	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
 		return "", fmt.Errorf("unmarshal encode response: %w", err2)
 	}
@@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er
 }

 func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
+	return embedding(llm, ctx, input)
+}
+
+func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
 	data, err := json.Marshal(TokenizeRequest{Content: input})
 	if err != nil {
 		return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float
 	req := C.CString(string(data))
 	defer C.free(unsafe.Pointer(req))

-	var resp C.ext_server_resp
-	err = errWrap(C.llama_server_embedding(req, &resp))
-	if resp.json_resp != nil {
-		defer C.free(unsafe.Pointer(resp.json_resp))
-	}
-	if err != nil {
-		return nil, err
+	var json_resp *C.char
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	llm.llama_server_embedding(req, &json_resp, &resp)
+	if resp.id < 0 {
+		return nil, extServerResponseToErr(resp)
 	}
+	defer llm.llama_server_release_json_resp(&json_resp)

 	var embedding EmbeddingResponse
-	if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil {
+	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
 		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
 	}

 	return embedding.Embedding, nil
 }

-func (llm *llamaExtServer) Ping(ctx context.Context) error {
-	// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state
-	return nil
+func (llm *llamaExtServer) Close() {
+	close(llm)
 }

-func (llm *llamaExtServer) Close() {
-	C.llama_server_stop()
+func close(llm extServer) {
+	llm.llama_server_stop()
 	mutex.Unlock()
 }
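The `extServer` interface introduced above exists so the shared request/response logic (`newExtServer`, `predict`, `encode`, `decode`, `embedding`) is written once against the interface while each accelerator build supplies thin cgo wrappers. A toy illustration of that pattern, with hypothetical types that are not part of the commit:

```go
package main

import "fmt"

// server is a toy version of the extServer interface: one Go-level
// contract, multiple thin wrappers around different native entry points.
type server interface {
	start()
}

type cudaServer struct{}

func (cudaServer) start() { fmt.Println("starting CUDA-linked server") }

type rocmServer struct{}

func (rocmServer) start() { fmt.Println("starting ROCm-linked server") }

// run holds the shared logic, written once against the interface.
func run(s server) { s.start() }

func main() {
	run(cudaServer{})
	run(rocmServer{})
}
```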
 # common logic across linux and darwin
 init_vars() {
+    LLAMACPP_DIR=gguf
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
-    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server"
+    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
     if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
@@ -29,6 +30,6 @@ apply_patches() {
 }

 build() {
-    cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS}
+    cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
 }
\ No newline at end of file
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp
@@ -30,6 +30,7 @@ git_module_setup
 apply_patches
 build

+# TODO - improve this to handle test cases that need it to be in "." around the tree
 # Enable local debug/run use case
 if [ -e "gguf/ggml-metal.metal" ]; then
     cp gguf/ggml-metal.metal ../../
-#!/bin/sh
+#!/bin/bash
 # This script is intended to run inside the go generate
 # working directory must be ../llm/llama.cpp

 set -ex
 set -o pipefail

-# TODO - stopped here - map the variables from above over and refine the case statement below
 echo "Starting linux generate script"
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+    export CUDACXX=/usr/local/cuda/bin/nvcc
+fi
 source $(dirname $0)/gen_common.sh
 init_vars
-CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/cuda"
 git_module_setup
 apply_patches
+CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cuda"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ../../dist/
 build
+
+# TODO - explore mechanism to soften the hard cuda dependency on linux
+# by conditionally building some archive here that aggregates the cuda libs if present
+# so that the cgo flags link this intermediate archive instead of the underlying cuda libs
+#
+# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \
+#     -Wl,--whole-archive \
+#     ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \
+#     ${BUILD_DIR}/common/libcommon.a \
+#     ${BUILD_DIR}/libllama.a \
+#     ${BUILD_DIR}/examples/llava/libllava_static.a \
+#     -Wl,--no-whole-archive \
+#     -lrt -lpthread -ldl -lstdc++ -lm \
+#     /usr/local/cuda/lib64/libcudart_static.a \
+#     /usr/local/cuda/lib64/libcublas_static.a \
+#     /usr/local/cuda/lib64/libcublasLt_static.a \
+#     /usr/local/cuda/lib64/libcudadevrt.a \
+#     /usr/local/cuda/lib64/libculibos.a
+
+if [ -z "${ROCM_PATH}" ] ; then
+    # Try the default location in case it exists
+    ROCM_PATH=/opt/rocm
+fi
+
+if [ -z "${CLBlast_DIR}" ] ; then
+    # Try the default location in case it exists
+    if [ -d /usr/lib/cmake/CLBlast ]; then
+        export CLBlast_DIR=/usr/lib/cmake/CLBlast
+    fi
+fi
+
+BUILD_DIR="gguf/build/rocm"
+LIB_DIR="${BUILD_DIR}/lib"
+mkdir -p ${LIB_DIR}
+# Ensure we have at least one file present for the embed
+touch ${LIB_DIR}/.generated
+
+if [ -d "${ROCM_PATH}" ] ; then
+    echo "Building ROCm"
+    init_vars
+    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
+    CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    build
+    gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        -lrt -lpthread -ldl -lstdc++ -lm \
+        -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
+        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
+fi