Commit 58d95cc9 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it has problems. This also serves as a first step
toward running multiple copies to support multiple models concurrently.
parent 3b6a9154
...@@ -56,10 +56,12 @@ jobs: ...@@ -56,10 +56,12 @@ jobs:
- run: go get ./... - run: go get ./...
- run: | - run: |
$gopath=(get-command go).source | split-path -parent $gopath=(get-command go).source | split-path -parent
$gccpath=(get-command gcc).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
cd $env:GITHUB_WORKSPACE cd $env:GITHUB_WORKSPACE
$env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
$env:PATH="$gopath;$env:PATH" $env:PATH="$gopath;$gccpath;$env:PATH"
echo $env:PATH
go generate -x ./... go generate -x ./...
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
name: "Windows Go Generate" name: "Windows Go Generate"
...@@ -69,7 +71,9 @@ jobs: ...@@ -69,7 +71,9 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
path: llm/llama.cpp/build/**/lib/* path: |
llm/build/**/bin/*
llm/build/**/*.a
generate-cuda: generate-cuda:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
...@@ -100,7 +104,7 @@ jobs: ...@@ -100,7 +104,7 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: cuda-${{ matrix.cuda-version }}-libraries name: cuda-${{ matrix.cuda-version }}-libraries
path: llm/llama.cpp/build/**/lib/* path: llm/build/**/bin/*
generate-rocm: generate-rocm:
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
...@@ -131,7 +135,7 @@ jobs: ...@@ -131,7 +135,7 @@ jobs:
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
with: with:
name: rocm-${{ matrix.rocm-version }}-libraries name: rocm-${{ matrix.rocm-version }}-libraries
path: llm/llama.cpp/build/**/lib/* path: llm/build/**/lib/*
# ROCm generation step # ROCm generation step
generate-windows-rocm: generate-windows-rocm:
...@@ -244,17 +248,17 @@ jobs: ...@@ -244,17 +248,17 @@ jobs:
esac >>$GITHUB_ENV esac >>$GITHUB_ENV
shell: bash shell: bash
- run: | - run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ mkdir -p llm/build/linux/$ARCH/stub/bin/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so touch llm/build/linux/$ARCH/stub/bin/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }} if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ mkdir -p llm/build/darwin/$ARCH/stub/bin/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
touch llm/llama.cpp/ggml-metal.metal touch llm/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }} if: ${{ startsWith(matrix.os, 'macos-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
- uses: golangci/golangci-lint-action@v3 - uses: golangci/golangci-lint-action@v3
test: test:
...@@ -271,6 +275,7 @@ jobs: ...@@ -271,6 +275,7 @@ jobs:
env: env:
GOARCH: ${{ matrix.arch }} GOARCH: ${{ matrix.arch }}
CGO_ENABLED: '1' CGO_ENABLED: '1'
OLLAMA_CPU_TARGET: "static"
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
...@@ -287,18 +292,19 @@ jobs: ...@@ -287,18 +292,19 @@ jobs:
esac >>$GITHUB_ENV esac >>$GITHUB_ENV
shell: bash shell: bash
- run: | - run: |
mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/ mkdir -p llm/build/linux/$ARCH/stub/bin/
touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so touch llm//build/linux/$ARCH/stub/bin/stub.so
if: ${{ startsWith(matrix.os, 'ubuntu-') }} if: ${{ startsWith(matrix.os, 'ubuntu-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/ mkdir -p llm/build/darwin/$ARCH/stub/bin/
touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
touch llm/llama.cpp/ggml-metal.metal touch llm/ggml-metal.metal
if: ${{ startsWith(matrix.os, 'macos-') }} if: ${{ startsWith(matrix.os, 'macos-') }}
- run: | - run: |
mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/ mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
if: ${{ startsWith(matrix.os, 'windows-') }} if: ${{ startsWith(matrix.os, 'windows-') }}
- run: go generate ./...
- run: go build - run: go build
- run: go test -v ./... - run: go test -v ./...
- uses: actions/upload-artifact@v4 - uses: actions/upload-artifact@v4
......
...@@ -10,4 +10,5 @@ ggml-metal.metal ...@@ -10,4 +10,5 @@ ggml-metal.metal
*.exe *.exe
.idea .idea
test_data test_data
*.crt *.crt
\ No newline at end of file llm/build
\ No newline at end of file
...@@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS ...@@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
...@@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh ...@@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64 FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
ARG CMAKE_VERSION ARG CMAKE_VERSION
ARG GOLANG_VERSION ARG GOLANG_VERSION
COPY ./scripts/rh_linux_deps.sh / COPY ./scripts/rh_linux_deps.sh /
RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
COPY --from=llm-code / /go/src/github.com/ollama/ollama/ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
ARG OLLAMA_CUSTOM_CPU_DEFS ARG OLLAMA_CUSTOM_CPU_DEFS
ARG CGO_CFLAGS ARG CGO_CFLAGS
WORKDIR /go/src/github.com/ollama/ollama/llm/generate
FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
# Intermediate stage used for ./scripts/build_linux.sh # Intermediate stage used for ./scripts/build_linux.sh
FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
ENV CGO_ENABLED 1 ENV CGO_ENABLED 1
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
...@@ -101,8 +108,8 @@ ENV CGO_ENABLED 1 ...@@ -101,8 +108,8 @@ ENV CGO_ENABLED 1
ARG GOLANG_VERSION ARG GOLANG_VERSION
WORKDIR /go/src/github.com/ollama/ollama WORKDIR /go/src/github.com/ollama/ollama
COPY . . COPY . .
COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
ARG GOFLAGS ARG GOFLAGS
ARG CGO_CFLAGS ARG CGO_CFLAGS
RUN go build -trimpath . RUN go build -trimpath .
......
...@@ -9,6 +9,7 @@ import ( ...@@ -9,6 +9,7 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"syscall"
"time" "time"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
...@@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) { ...@@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
io.Copy(logFile, stderr) //nolint:errcheck io.Copy(logFile, stderr) //nolint:errcheck
}() }()
// Re-wire context done behavior to attempt a graceful shutdown of the server:
// send an interrupt, poll until the process is gone, and kill it if it is
// still alive after 5 seconds.
cmd.Cancel = func() error {
	if cmd.Process == nil {
		return nil
	}
	cmd.Process.Signal(os.Interrupt) //nolint:errcheck

	tick := time.NewTicker(10 * time.Millisecond)
	defer tick.Stop()
	// The deadline has to be created once, outside the loop. Calling
	// time.After inside the select would start a fresh 5s timer on every
	// 10ms tick, so the timeout branch could never fire.
	deadline := time.NewTimer(5 * time.Second)
	defer deadline.Stop()

	for {
		select {
		case <-tick.C:
			// OS agnostic "is it still running"
			if proc, err := os.FindProcess(cmd.Process.Pid); err != nil || errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
				return nil //nolint:nilerr
			}
		case <-deadline.C:
			slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
			cmd.Process.Kill() //nolint:errcheck
			// keep polling on the ticker until the process is really gone
		}
	}
}
// run the command and wait for it to finish // run the command and wait for it to finish
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
return done, fmt.Errorf("failed to start server %w", err) return done, fmt.Errorf("failed to start server %w", err)
...@@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) { ...@@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code)) slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
done <- code done <- code
return return
default: default:
......
...@@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) { ...@@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) {
return return
} }
updateLibPath(libDir)
gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION") gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
if gfxOverride == "" { if gfxOverride == "" {
supported, err := GetSupportedGFX(libDir) supported, err := GetSupportedGFX(libDir)
...@@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) { ...@@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) {
} }
} }
// updateLibPath appends libDir to the LD_LIBRARY_PATH environment variable of
// the current process, unless libDir is already one of its entries.
func updateLibPath(libDir string) {
	var ldPaths []string
	// Treat a set-but-empty variable like an unset one. Splitting "" yields a
	// single empty entry, which would produce ":<libDir>"; the dynamic linker
	// interprets an empty entry as the current working directory.
	if val := os.Getenv("LD_LIBRARY_PATH"); val != "" {
		ldPaths = strings.Split(val, ":")
	}
	for _, d := range ldPaths {
		if d == libDir {
			return
		}
	}
	val := strings.Join(append(ldPaths, libDir), ":")
	if err := os.Setenv("LD_LIBRARY_PATH", val); err != nil {
		slog.Warn("failed to update lib path", "LD_LIBRARY_PATH", val, "error", err)
		return
	}
	slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
}
// Walk the sysfs nodes for the available GPUs and gather information from them // Walk the sysfs nodes for the available GPUs and gather information from them
// skipping over any devices in the skip map // skipping over any devices in the skip map
func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) { func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
......
...@@ -11,6 +11,7 @@ import ( ...@@ -11,6 +11,7 @@ import (
"strings" "strings"
"sync" "sync"
"syscall" "syscall"
"time"
) )
var ( var (
...@@ -84,7 +85,12 @@ func Cleanup() { ...@@ -84,7 +85,12 @@ func Cleanup() {
slog.Debug("cleaning up", "dir", tmpDir) slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir) err := os.RemoveAll(tmpDir)
if err != nil { if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err) // On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
} }
} }
} }
......
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
// Platform abstraction for runtime library loading.
//   LOAD_LIBRARY(lib, flags)  open the library (flags are ignored on Windows)
//   LOAD_SYMBOL(handle, sym)  resolve a symbol in an opened library
//   UNLOAD_LIBRARY(handle)    close the library
//   LOAD_ERR()                heap-allocated copy of the most recent loader
//                             error text; the caller must free() it
#ifdef _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// FormatMessageA returns 0 and leaves the buffer NULL on failure, so fall back
// to a fixed string rather than passing NULL to strdup.
#define LOAD_ERR() ({\
  LPSTR messageBuffer = NULL; \
  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
  char *resp = strdup((size > 0 && messageBuffer != NULL) ? messageBuffer : "unknown error"); \
  LocalFree(messageBuffer); \
  resp; \
})
#else
// Linux and every other non-Windows platform use the dlfcn interface.
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
// dlerror returns NULL when no error is pending, so guard the strdup.
#define LOAD_ERR() ({\
  const char *dlerr = dlerror(); \
  strdup(dlerr != NULL ? dlerr : "unknown error"); \
})
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
// dyn_init loads the shared library at libPath and resolves every
// llama_server_* entry point into the function pointer table in s.
//
// On failure err->id is set to -1 and a description is written to err->msg
// (at most err->msg_len bytes); if a symbol is missing the library is
// unloaded again and s->handle is reset to NULL. On success err is not
// modified.
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
              ext_server_resp_t *err) {
  int i = 0;

  // Table of symbol names and the slots in s that receive their addresses.
  // The entry with a NULL slot terminates the table.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"llama_server_init", (void *)&s->llama_server_init},
      {"llama_server_start", (void *)&s->llama_server_start},
      {"llama_server_stop", (void *)&s->llama_server_stop},
      {"llama_server_completion", (void *)&s->llama_server_completion},
      {"llama_server_completion_next_result",
       (void *)&s->llama_server_completion_next_result},
      {"llama_server_completion_cancel",
       (void *)&s->llama_server_completion_cancel},
      {"llama_server_release_task_result",
       (void *)&s->llama_server_release_task_result},
      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
      {"llama_server_embedding", (void *)&s->llama_server_embedding},
      {"llama_server_release_json_resp",
       (void *)&s->llama_server_release_json_resp},
      {"", NULL},
  };

  printf("loading library %s\n", libPath);
  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
  if (!s->handle) {
    err->id = -1;
    char *msg = LOAD_ERR();
    snprintf(err->msg, err->msg_len,
             "Unable to load dynamic server library: %s", msg);
    free(msg);
    return;
  }

  for (i = 0; l[i].p != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
    // Test the resolved symbol, not the slot address: l[i].p is never NULL
    // inside this loop, so checking it would let missing symbols through.
    if (!*l[i].p) {
      // Capture the loader error before unloading, which may overwrite it.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(s->handle);
      s->handle = NULL;
      err->id = -1;
      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
               l[i].s, msg);
      free(msg);
      return;
    }
  }
}
// Thin forwarding wrappers: each one calls the function pointer of the same
// name stored in the dynamic_llama_server table `s` (filled in by dyn_init)
// and passes its remaining arguments through unchanged. The table is passed
// by value.

// Forwards to s.llama_server_init.
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
// Forwards to s.llama_server_start.
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
// Forwards to s.llama_server_stop.
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
// Forwards to s.llama_server_completion.
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
// Forwards to s.llama_server_completion_next_result.
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
// Forwards to s.llama_server_completion_cancel.
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
// Forwards to s.llama_server_release_task_result.
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
// Forwards to s.llama_server_tokenize.
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
// Forwards to s.llama_server_detokenize.
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
// Forwards to s.llama_server_embedding.
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
// Forwards to s.llama_server_release_json_resp.
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
"unsafe"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
// dynExtServer is the handle to a llama server loaded from a shared library.
// newDynExtServer returns a pointer to it as an LLM.
type dynExtServer struct {
// s is the table of C function pointers populated by C.dyn_init
s C.struct_dynamic_llama_server
// options holds the options that were passed to newDynExtServer
options *api.Options
}
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
// newExtServerResp returns a response struct whose msg field points at a
// zero-filled, C-allocated buffer of size bytes. The buffer must later be
// released with freeExtServerResp.
func newExtServerResp(size C.size_t) C.ext_server_resp_t {
	// C.CBytes copies the zeroed Go slice into memory owned by the C heap
	zeroed := make([]byte, size)
	msgBuf := (*C.char)(C.CBytes(zeroed))

	var out C.ext_server_resp_t
	out.msg = msgBuf
	out.msg_len = size
	return out
}
// freeExtServerResp releases the C message buffer held by resp. A response
// with a zero msg_len is left alone.
func freeExtServerResp(resp C.ext_server_resp_t) {
	if resp.msg_len != 0 {
		C.free(unsafe.Pointer(resp.msg))
	}
}
// extServerResponseToErr converts the message carried by resp into a Go error.
func extServerResponseToErr(resp C.ext_server_resp_t) error {
	// Pass the message as an argument, never as the format string: text coming
	// back from the C side may contain '%' and would otherwise be mangled.
	return fmt.Errorf("%s", C.GoString(resp.msg))
}
// newDynExtServer loads the llama server shared library at library, initializes
// it for model with the given LoRA adapters, projectors and options, and starts
// its main loop.
//
// Only one server can be active at a time: the package-level mutex is acquired
// here and released either on one of the failure paths below or by Close. If
// another server currently holds it, this call blocks until it is released.
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
// Try a non-blocking acquire first so that having to wait can be logged
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
// NOTE(review): presumably makes the library's directory visible to the loader — confirm in the gpu package
gpu.UpdatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
// A negative id signals failure; release the lock since no server was created
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm := dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
// Translate the Go options into the C parameter struct. Every C string
// allocated below is freed by a defer when this function returns.
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
// Always use the value encoded in the model
sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
if opts.UseNUMA {
sparams.numa = C.int(1)
} else {
sparams.numa = C.int(0)
}
// Build a singly linked list of adapters, preserving their order. The
// deferred frees run at function return, not at the end of each iteration.
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter))
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
la.next = nil
if i == 0 {
sparams.lora_adapters = la
} else {
// Walk to the current tail and append the new node
tmp := sparams.lora_adapters
for ; tmp.next != nil; tmp = tmp.next {
}
tmp.next = la
}
}
// Only the first projector is passed on; any others are ignored
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
sparams.mmproj = C.CString(projectors[0])
defer C.free(unsafe.Pointer(sparams.mmproj))
} else {
sparams.mmproj = nil
}
sparams.n_threads = C.uint(opts.NumThread)
// Any non-empty OLLAMA_DEBUG value turns on verbose logging
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
// Initialization failed: release the lock and report the library's message
if initResp.id < 0 {
mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
// The mutex stays held on success; Close releases it
return &llm, nil
}
// Predict submits a streaming completion request built from predict and calls
// fn once for every non-empty piece of content, then once more with Done set
// and the timing counters when generation stops.
//
// If the server reports "slot unavailable" the request is resubmitted, up to
// maxRetries attempts with a delay that starts at 100µs and doubles each time.
// When ctx is done, or the same token repeats more than 30 times in a row, the
// running completion is cancelled and the result of the cancel call is returned.
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
}
// JSON body of the completion request
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
"n_predict": predict.Options.NumPredict,
"n_keep": predict.Options.NumKeep,
"temperature": predict.Options.Temperature,
"top_k": predict.Options.TopK,
"top_p": predict.Options.TopP,
"tfs_z": predict.Options.TFSZ,
"typical_p": predict.Options.TypicalP,
"repeat_last_n": predict.Options.RepeatLastN,
"repeat_penalty": predict.Options.RepeatPenalty,
"presence_penalty": predict.Options.PresencePenalty,
"frequency_penalty": predict.Options.FrequencyPenalty,
"mirostat": predict.Options.Mirostat,
"mirostat_tau": predict.Options.MirostatTau,
"mirostat_eta": predict.Options.MirostatEta,
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"cache_prompt": true,
}
// For JSON output, constrain generation with jsonGrammar (defined elsewhere)
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %w", err)
}
req := C.CString(buffer.String())
// NOTE: this defer sits inside the retry loop, so each attempt's request
// string is only freed when Predict returns
defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp)
// A negative id means the request was rejected; otherwise resp.id is the
// task id used to fetch results below
if resp.id < 0 {
return extServerResponseToErr(resp)
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
default:
// Fetch the next result, copy its JSON into Go memory, then release
// the C-side result straight away
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
// Unparseable result: cancel the task and report the parse error,
// including the cancel failure message if cancelling failed as well
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
}
}
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
retryNeeded = true
// task will already be canceled
break out
}
// Count consecutive results whose trimmed content is identical
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
// Final result: emit the summary with timing information and finish
if p.Stop || bool(result.stop) {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
// The inner loop is only left through "break out", which sets retryNeeded first
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
// cancelCompletion asks the server to cancel the task identified by resp.id.
// It returns nil when the cancel call succeeds and the server's error message
// as an error when the call reports a negative id.
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
	if resp.id >= 0 {
		return nil
	}
	return extServerResponseToErr(resp)
}
// Encode sends prompt to the server's tokenize entry point and returns the
// token ids found in the JSON response.
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	cReq := C.CString(string(payload))
	defer C.free(unsafe.Pointer(cReq))

	status := newExtServerResp(128)
	defer freeExtServerResp(status)

	var cJSON *C.char
	C.dyn_llama_server_tokenize(llm.s, cReq, &cJSON, &status)
	if status.id < 0 {
		return nil, extServerResponseToErr(status)
	}
	// The response buffer is owned by the library; hand it back once parsed
	defer C.dyn_llama_server_release_json_resp(llm.s, &cJSON)

	var parsed TokenizeResponse
	if uerr := json.Unmarshal([]byte(C.GoString(cJSON)), &parsed); uerr != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", uerr)
	}
	return parsed.Tokens, nil
}
// Decode sends tokens to the server's detokenize entry point and returns the
// text found in the JSON response. An empty token slice yields an empty string
// without calling the server.
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
	if resp.id < 0 {
		return "", extServerResponseToErr(resp)
	}
	// The response buffer is owned by the library; hand it back once parsed
	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

	var decoded DetokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
		// Previously reported as an "encode" failure (copy-paste from Encode)
		return "", fmt.Errorf("unmarshal decode response: %w", err)
	}
	return decoded.Content, nil
}
// Embedding sends input to the server's embedding entry point and returns the
// embedding vector found in the JSON response.
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	// The response buffer is owned by the library; hand it back once parsed
	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
		// Previously reported as a "tokenize" failure (copy-paste from Encode)
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}
	return embedding.Embedding, nil
}
// Close stops the llama server and then releases the package-level mutex that
// newDynExtServer acquired, allowing another server to be created.
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
// Table of llama server entry points resolved at runtime from a shared
// library. dyn_init fills in every member.
struct dynamic_llama_server {
// Handle of the loaded library as returned by the platform loader
void *handle;
// One function pointer per exported llama_server_* symbol of the same name
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
// Loads the library at libPath and fills in the function table s. Failure is
// reported through err (id set to -1 and a message written to err->msg).
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
// Each dyn_llama_server_* function below invokes the member of s with the
// matching name and passes the remaining arguments through.
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
\ No newline at end of file
set(TARGET ext_server) set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32) if (WIN32)
add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
else()
add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
endif() endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) \ No newline at end of file
target_link_libraries(${TARGET} PRIVATE ggml llava common )
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
install(TARGETS ext_server LIBRARY)
if (CUDAToolkit_FOUND)
target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
if (WIN32)
target_link_libraries(${TARGET} PRIVATE nvml)
endif()
endif()
\ No newline at end of file
# Extern C Server
This directory contains a thin facade we layer on top of the llama.cpp server to
expose `extern "C"` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile-time macros to configure the GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern "C"` server based
on which GPU libraries are detected, to support multiple GPU types as well as
CPU-only operation. The Ollama Go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```
\ No newline at end of file
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
// The single server context of this process: allocated by llama_server_init
// and deleted (and reset to NULL) by llama_server_stop.
llama_server_context *llama = NULL;
// Thread that runs the task-queue main loop; created by llama_server_start and
// joined by llama_server_stop.
std::thread ext_server_thread;
// True while llama_server_stop is draining requests. It is written by the
// stopping thread and read concurrently by the request entry points, so it
// must be atomic: a plain bool here is a data race.
std::atomic<bool> shutting_down{false};
// Number of callers currently tracked by an atomicRecv guard, i.e. blocked in
// (or about to enter) queue_results.recv(). llama_server_stop waits for this
// to reach zero before tearing the server down.
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls.
//
// Constructing a guard increments the referenced counter and destroying it
// decrements the counter again, so the counter always equals the number of
// live guards regardless of how the enclosing scope is left (return or
// exception).
class atomicRecv {
  public:
    // explicit: a counter must never be silently converted into a guard.
    explicit atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
      ++this->atomic;
    }
    ~atomicRecv() {
      --this->atomic;
    }
    // Non-copyable: a copy would not increment the counter but its destructor
    // would still decrement it, driving the in-flight count negative.
    atomicRecv(const atomicRecv &) = delete;
    atomicRecv &operator=(const atomicRecv &) = delete;
  private:
    std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = (ggml_numa_strategy)sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
// Starts the server main loop on a background thread stored in
// ext_server_thread. Requires a context created by llama_server_init
// (asserted). Returns as soon as the thread has been created.
//
// NOTE(review): nothing prevents a second call while the previous thread is
// still joinable; move-assigning to a joinable std::thread terminates the
// process (see the TODO below).
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
// Wire the task queue callbacks to the member functions of the context.
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
// Multitask updates coming from the result queue are routed back into the
// task queue.
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
// NOTE(review): presumably blocks until queue_tasks.terminate() is called
// (llama_server_stop calls it before joining this thread) - confirm in
// server.cpp.
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
// Reached both after a normal loop exit and after a caught exception.
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
// Stops the server and frees the context created by llama_server_init.
//
// Sequence: flag the shutdown so new requests are rejected, wait until no
// caller is blocked waiting for a result, terminate the task queue, join the
// main-loop thread, then delete the context. llama_server_init must be called
// again before the server can be reused.
void llama_server_stop() {
  assert(llama != NULL);
  // Shutdown any in-flight requests and block incoming requests.
  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
  shutting_down = true;

  // Poll until every in-flight recv has returned.
  while (recv_counter.load() > 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }

  // This may take a while for any pending tasks to drain
  // TODO - consider a timeout to cancel tasks if it's taking too long
  llama->queue_tasks.terminate();
  // The thread only exists if llama_server_start was called. join() on a
  // non-joinable thread throws std::system_error, which would escape this
  // extern "C" entry point and terminate the process.
  if (ext_server_thread.joinable()) {
    ext_server_thread.join();
  }
  delete llama;
  llama = NULL;
  LOG_TEE("llama server shutdown complete\n");
  shutting_down = false;
}
// Queues a completion request.
//
// json_req: null-terminated JSON request, owned by the caller.
// resp:     caller-owned response. On success resp->id is the task id
//           (>= 0) to pass to llama_server_completion_next_result. On failure
//           resp->id is -1 and resp->msg holds the reason.
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
  assert(llama != NULL && json_req != NULL && resp != NULL);
  resp->id = -1;
  resp->msg[0] = '\0';
  // Kept local until the request is fully queued so that a failure part-way
  // through can never leave a valid-looking task id in resp->id.
  int task_id = -1;
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    json data = json::parse(json_req);
    task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
    llama->request_completion(task_id, data, false, false, -1);
    resp->id = task_id;
  } catch (std::exception &e) {
    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
  } catch (...) {
    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
  }
  if (resp->id < 0 && task_id >= 0) {
    // The request failed after an id was handed out: nobody will ever collect
    // results for it, so drop the waiting entry. Best effort only - nothing
    // may propagate out of an extern "C" function.
    try {
      llama->queue_results.remove_waiting_task_id(task_id);
    } catch (...) {
    }
  }
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
// Frees the JSON buffer that llama_server_completion_next_result stored in
// result->json_resp. A NULL result or NULL buffer is ignored. The pointer is
// reset to NULL afterwards so that releasing the same result twice is a no-op
// instead of a double free.
void llama_server_release_task_result(ext_server_task_result_t *result) {
  if (result == NULL || result->json_resp == NULL) {
    return;
  }
  delete[] result->json_resp;
  result->json_resp = NULL;
}
// Cancels the task identified by task_id and stops waiting for its results.
//
// err: caller-owned. err->id is 0 with an empty message on success; on an
//      exception err->id is -1 and err->msg describes it.
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
  assert(llama != NULL && err != NULL);
  // Assume success until an exception proves otherwise.
  err->msg[0] = '\0';
  err->id = 0;
  try {
    llama->request_cancel(task_id);
    llama->queue_results.remove_waiting_task_id(task_id);
  } catch (std::exception &ex) {
    snprintf(err->msg, err->msg_len, "exception %s", ex.what());
    err->id = -1;
  } catch (...) {
    snprintf(err->msg, err->msg_len,
             "Unknown exception completion cancel in llama server");
    err->id = -1;
  }
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
// Frees a response buffer produced by llama_server_tokenize,
// llama_server_detokenize or llama_server_embedding.
//
// json_resp: address of the buffer pointer. A NULL address or NULL buffer is
//            ignored. The caller's pointer is reset to NULL after the buffer
//            is freed so that a repeated release is a no-op instead of a
//            double free.
void llama_server_release_json_resp(char **json_resp) {
  if (json_resp == NULL || *json_resp == NULL) {
    return;
  }
  delete[] *json_resp;
  *json_resp = NULL;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
// Computes the embedding for the "content" field of a JSON request and blocks
// until the result is available.
//
// json_req:  null-terminated JSON, owned by the caller. A missing "content"
//            key is treated as an empty prompt.
// json_resp: receives a newly allocated, null-terminated JSON string on
//            success (release with llama_server_release_json_resp); left NULL
//            on failure.
// err:       err->id is 0 on success, -1 with a message in err->msg otherwise.
void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err) {
  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
  *json_resp = NULL;
  err->id = 0;
  err->msg[0] = '\0';
  // Register as in-flight BEFORE looking at the shutdown flag and keep the
  // registration until the function returns. llama_server_stop sets the flag
  // and then waits for the counter to reach zero; taking the guard first means
  // either this call observes the flag and bails out, or stop observes the
  // counter and waits - the context cannot be deleted while it is used here.
  atomicRecv ar(recv_counter);
  // >= 0 only while the id is registered in the waiting set.
  int task_id = -1;
  try {
    if (shutting_down) {
      throw std::runtime_error("server shutting down");
    }
    const json body = json::parse(json_req);
    json prompt;
    if (body.count("content") != 0) {
      prompt = body["content"];
    } else {
      prompt = "";
    }
    task_id = llama->queue_tasks.get_new_id();
    llama->queue_results.add_waiting_task_id(task_id);
    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
    task_result result = llama->queue_results.recv(task_id);
    // Deregister before allocating the response so that a failure below can
    // neither leak the waiting entry nor leave an allocated buffer behind a
    // reported error.
    llama->queue_results.remove_waiting_task_id(task_id);
    task_id = -1;
    std::string result_json = result.result_json.dump();
    const std::string::size_type size = result_json.size() + 1;
    *json_resp = new char[size];
    snprintf(*json_resp, size, "%s", result_json.c_str());
  } catch (std::exception &e) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "exception %s", e.what());
  } catch (...) {
    err->id = -1;
    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
  }
  if (task_id >= 0) {
    // The request failed while its id was still registered. Best-effort
    // cleanup - nothing may propagate out of an extern "C" function.
    try {
      llama->queue_results.remove_waiting_task_id(task_id);
    } catch (...) {
    }
  }
}
\ No newline at end of file
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
int __main(int argc, char **argv);
// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
// Status/response record filled in by the llama_server_* entry points.
// The caller owns the struct and the msg buffer.
typedef struct ext_server_resp {
int id; // < 0 on error
size_t msg_len; // caller must allocate msg and set msg_len
char *msg; // null-terminated error text, empty string when there is no error
} ext_server_resp_t;
// Allocated and freed by caller
// One node of a singly linked list of LoRA adapters; the list ends at
// next == NULL.
typedef struct ext_server_lora_adapter {
char *adapter;
float scale;
struct ext_server_lora_adapter *next;
} ext_server_lora_adapter_t;
// Allocated and freed by caller
// Settings consumed by llama_server_init.
typedef struct ext_server_params {
char *model;
uint32_t n_ctx; // token context window, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
int32_t n_parallel; // number of parallel sequences to decode
float rope_freq_base; // RoPE base frequency, 0 = from model
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
bool memory_f16; // use f16 instead of f32 for memory kv
int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
int32_t main_gpu; // the GPU that is used for scratch and small tensors
bool use_mlock; // force system to keep model in RAM
bool use_mmap; // use mmap if possible
int numa; // attempt optimizations that help on some NUMA systems
bool embedding; // get only sentence embedding
ext_server_lora_adapter_t *lora_adapters; // head of the adapter list, NULL = none
char *mmproj; // NULL = not set
bool verbose_logging; // Enable verbose logging of the server
} ext_server_params_t;
// One result of a completion task, filled in by
// llama_server_completion_next_result.
typedef struct ext_server_task_result {
int id; // -1 when the call itself failed with an exception
bool stop; // true when no further results should be requested for the task
bool error;
char *json_resp; // null terminated, memory managed by ext_server; release with llama_server_release_task_result
} ext_server_task_result_t;
// Initialize the server once per process
// err->id = 0 for success and err->msg[0] = '\0'
// err->id != 0 for failure, and err->msg contains error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
// Run the main loop, called once per init
void llama_server_start();
// Stop the main loop and free up resources allocated in init and start. Init
// must be called again to reuse
void llama_server_stop();
// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
// Caller must call llama_server_release_task_result to free resp->json_resp
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *result);
// err->id = 0 for success, err->id < 0 for failure with err->msg set
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
// Frees result->json_resp; a NULL result or NULL json_resp is ignored
void llama_server_release_task_result(ext_server_task_result_t *result);
// For tokenize, detokenize and embedding: on success (err.id == 0) json_resp
// points to a newly allocated string and the caller must call
// llama_server_release_json_resp to free it; on failure (err.id < 0)
// json_resp is set to NULL and err.msg contains the error message
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err);
// Frees *json_resp; a NULL json_resp or NULL *json_resp is ignored
void llama_server_release_json_resp(char **json_resp);
#ifdef __cplusplus
}
#endif
#endif
#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file
...@@ -2768,7 +2768,7 @@ inline void signal_handler(int signal) { ...@@ -2768,7 +2768,7 @@ inline void signal_handler(int signal) {
shutdown_handler(signal); shutdown_handler(signal);
} }
int _main(int argc, char **argv) int main(int argc, char **argv)
{ {
#if SERVER_VERBOSE != 1 #if SERVER_VERBOSE != 1
log_disable(); log_disable();
......
...@@ -14,7 +14,7 @@ init_vars() { ...@@ -14,7 +14,7 @@ init_vars() {
LLAMACPP_DIR=../llama.cpp LLAMACPP_DIR=../llama.cpp
CMAKE_DEFS="" CMAKE_DEFS=""
CMAKE_TARGETS="--target ext_server" CMAKE_TARGETS="--target ollama_llama_server"
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
else else
...@@ -81,27 +81,24 @@ apply_patches() { ...@@ -81,27 +81,24 @@ apply_patches() {
build() { build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
mkdir -p ${BUILD_DIR}/lib/
ls ${BUILD_DIR}
g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
${GCC_ARCH} \
${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,-rpath,\$ORIGIN \
-lpthread -ldl -lm \
${EXTRA_LIBS}
} }
compress_libs() { compress() {
echo "Compressing payloads to reduce overall binary size..." echo "Compressing payloads to reduce overall binary size..."
pids="" pids=""
rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz rm -rf ${BUILD_DIR}/bin/*.gz
for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do for f in ${BUILD_DIR}/bin/* ; do
gzip -n --best -f ${lib} & gzip -n --best -f ${f} &
pids+=" $!" pids+=" $!"
done done
echo # check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
gzip -n --best -f ${f} &
pids+=" $!"
done
fi
echo
for pid in ${pids}; do for pid in ${pids}; do
wait $pid wait $pid
done done
......
...@@ -18,21 +18,31 @@ sign() { ...@@ -18,21 +18,31 @@ sign() {
fi fi
} }
COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin" COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
case "${GOARCH}" in case "${GOARCH}" in
"amd64") "amd64")
COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
# #
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
# #
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" BUILD_DIR="../build/darwin/${ARCH}/cpu"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
# #
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
...@@ -40,11 +50,11 @@ case "${GOARCH}" in ...@@ -40,11 +50,11 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
# #
# ~2013 CPU Dynamic library # ~2013 CPU Dynamic library
...@@ -52,20 +62,30 @@ case "${GOARCH}" in ...@@ -52,20 +62,30 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
;; ;;
"arm64") "arm64")
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}_static"
echo "Building static library"
build
init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" BUILD_DIR="../build/darwin/${ARCH}/metal"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build build
sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib sign ${BUILD_DIR}/lib/libext_server.dylib
compress_libs compress
;; ;;
*) *)
echo "GOARCH must be set" echo "GOARCH must be set"
...@@ -75,3 +95,4 @@ case "${GOARCH}" in ...@@ -75,3 +95,4 @@ case "${GOARCH}" in
esac esac
cleanup cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
...@@ -57,16 +57,31 @@ init_vars ...@@ -57,16 +57,31 @@ init_vars
git_module_setup git_module_setup
apply_patches apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
# Static build for linking into the Go binary
init_vars
CMAKE_TARGETS="--target llama --target ggml"
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}_static"
echo "Building static library"
build
fi
# Users building from source can tune the exact flags we pass to cmake for configuring # Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default. # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building custom CPU" echo "Building custom CPU"
build build
compress_libs compress
else else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
...@@ -83,11 +98,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then ...@@ -83,11 +98,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
# #
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" BUILD_DIR="../build/linux/${ARCH}/cpu"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
compress_libs compress
fi fi
if [ "${ARCH}" == "x86_64" ]; then if [ "${ARCH}" == "x86_64" ]; then
...@@ -101,10 +117,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then ...@@ -101,10 +117,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
compress_libs compress
fi fi
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
...@@ -114,10 +130,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then ...@@ -114,10 +130,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
build build
compress_libs compress
fi fi
fi fi
fi fi
...@@ -157,7 +173,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then ...@@ -157,7 +173,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off" ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
fi fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}" CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
build build
...@@ -165,20 +181,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then ...@@ -165,20 +181,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
# #
# TODO - in the future we may shift to packaging these separately and conditionally # TODO - in the future we may shift to packaging these separately and conditionally
# downloading them in the install script. # downloading them in the install script.
DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )" DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
for lib in libcudart.so libcublas.so libcublasLt.so ; do for lib in libcudart.so libcublas.so libcublasLt.so ; do
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/" cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/" cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/" cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
else else
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/" cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
fi fi
done done
compress_libs compress
fi fi
...@@ -201,23 +217,24 @@ if [ -d "${ROCM_PATH}" ]; then ...@@ -201,23 +217,24 @@ if [ -d "${ROCM_PATH}" ]; then
fi fi
init_vars init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
build build
# Record the ROCM dependencies # Record the ROCM dependencies
rm -f "${BUILD_DIR}/lib/deps.txt" rm -f "${BUILD_DIR}/bin/deps.txt"
touch "${BUILD_DIR}/lib/deps.txt" touch "${BUILD_DIR}/bin/deps.txt"
for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt" echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
done done
# bomb out if for some reason we didn't get a few deps # bomb out if for some reason we didn't get a few deps
if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
cat "${BUILD_DIR}/lib/deps.txt" cat "${BUILD_DIR}/bin/deps.txt"
echo "ERROR: deps file short" echo "ERROR: deps file short"
exit 1 exit 1
fi fi
compress_libs compress
fi fi
cleanup cleanup
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
...@@ -33,7 +33,7 @@ function init_vars { ...@@ -33,7 +33,7 @@ function init_vars {
"-DBUILD_SHARED_LIBS=on", "-DBUILD_SHARED_LIBS=on",
"-DLLAMA_NATIVE=off" "-DLLAMA_NATIVE=off"
) )
$script:cmakeTargets = @("ext_server") $script:cmakeTargets = @("ollama_llama_server")
$script:ARCH = "amd64" # arm not yet supported. $script:ARCH = "amd64" # arm not yet supported.
if ($env:CGO_CFLAGS -contains "-g") { if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
...@@ -97,16 +97,14 @@ function apply_patches { ...@@ -97,16 +97,14 @@ function apply_patches {
} }
# Checkout each file # Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) { foreach ($file in $filePaths) {
git checkout $file git -C "${script:llamacppDir}" checkout $file
} }
} }
# Apply each patch # Apply each patch
foreach ($patch in $patches) { foreach ($patch in $patches) {
Set-Location -Path ${script:llamacppDir} git -C "${script:llamacppDir}" apply $patch.FullName
git apply $patch.FullName
} }
} }
...@@ -115,41 +113,41 @@ function build { ...@@ -115,41 +113,41 @@ function build {
& cmake --version & cmake --version
& cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })" write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} # Rearrange output to be consistent between different generators
if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
function install { mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
rm -ea 0 -recurse -force -path "${script:buildDir}/lib" remove-item "${script:buildDir}/bin/${script:config}"
md "${script:buildDir}/lib" -ea 0 > $null
cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
# Display the dll dependencies in the build log
if ($script:DUMPBIN -ne $null) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
} }
} }
# sign: code-signs build outputs, but only when the KEY_CONTAINER environment
# variable is non-empty; otherwise the function does nothing.
# Each matching file is passed to SignTool with a SHA-256 file digest, the
# DigiCert timestamp URL, the certificate in $script:OLLAMA_CERT, and the
# "Google Cloud KMS Provider" CSP keyed by KEY_CONTAINER.
# The older revision signed lib/*.dll; the newer one signs bin/*.exe and bin/*.dll.
function sign { function sign {
if ("${env:KEY_CONTAINER}") { if ("${env:KEY_CONTAINER}") {
write-host "Signing ${script:buildDir}/lib/*.dll" write-host "Signing ${script:buildDir}/bin/*.exe ${script:buildDir}/bin/*.dll"
foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){ foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
& "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
# Abort the whole script with SignTool's exit code as soon as one file fails to sign.
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
} }
} }
# compress: gzips the build outputs in place with "--best -f".
# When $script:GZIP is unset it prints a notice and returns without compressing.
# The older revision (compress_libs) handled lib/*.dll only; the newer one
# handles bin/*.exe first and then bin/*.dll.
function compress_libs { function compress {
if ($script:GZIP -eq $null) { if ($script:GZIP -eq $null) {
write-host "gzip not installed, not compressing files" write-host "gzip not installed, not compressing files"
return return
} }
write-host "Compressing binaries..."
$binaries = dir "${script:buildDir}/bin/*.exe"
foreach ($file in $binaries) {
& "$script:GZIP" --best -f $file
}
write-host "Compressing dlls..." write-host "Compressing dlls..."
# The loop below iterates $dlls, so the DLL listing must be stored under that
# name; assigning it to $binaries left $dlls unset and no DLL was compressed.
$libs = dir "${script:buildDir}/lib/*.dll" $dlls = dir "${script:buildDir}/bin/*.dll"
foreach ($file in $libs) { foreach ($file in $dlls) {
& "$script:GZIP" --best -f $file & "$script:GZIP" --best -f $file
} }
} }
...@@ -164,14 +162,11 @@ function cleanup { ...@@ -164,14 +162,11 @@ function cleanup {
} }
# Checkout each file # Checkout each file
Set-Location -Path ${script:llamacppDir}
foreach ($file in $filePaths) { foreach ($file in $filePaths) {
git checkout $file git -C "${script:llamacppDir}" checkout $file
} }
git -C "${script:llamacppDir}" checkout CMakeLists.txt
} }
Set-Location "${script:llamacppDir}/"
git checkout CMakeLists.txt
} }
init_vars init_vars
...@@ -179,7 +174,6 @@ git_module_setup ...@@ -179,7 +174,6 @@ git_module_setup
apply_patches apply_patches
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
...@@ -187,32 +181,46 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") ...@@ -187,32 +181,46 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) { if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
# GCC build for direct linking into the Go binary
init_vars
$script:cmakeTargets = @("llama", "ggml")
$script:cmakeDefs = @(
"-G", "MinGW Makefiles"
"-DBUILD_SHARED_LIBS=off",
"-DLLAMA_NATIVE=off",
"-DLLAMA_AVX=off",
"-DLLAMA_AVX2=off",
"-DLLAMA_AVX512=off",
"-DLLAMA_F16C=off",
"-DLLAMA_FMA=off")
$script:buildDir="../build/windows/${script:ARCH}_static"
write-host "Building static library"
build
# remaining llama.cpp builds use MSVC
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" $script:buildDir="../build/windows/${script:ARCH}/cpu"
write-host "Building LCD CPU" write-host "Building LCD CPU"
build build
install
sign sign
compress_libs compress
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
write-host "Building AVX CPU" write-host "Building AVX CPU"
build build
install
sign sign
compress_libs compress
init_vars init_vars
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
write-host "Building AVX2 CPU" write-host "Building AVX2 CPU"
build build
install
sign sign
compress_libs compress
} else { } else {
write-host "Skipping CPU generation step as requested" write-host "Skipping CPU generation step as requested"
} }
...@@ -225,13 +233,11 @@ if ($null -ne $script:CUDA_LIB_DIR) { ...@@ -225,13 +233,11 @@ if ($null -ne $script:CUDA_LIB_DIR) {
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
} }
init_vars init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
write-host "Building CUDA"
build build
install
sign sign
compress_libs compress
} }
if ($null -ne $env:HIP_PATH) { if ($null -ne $env:HIP_PATH) {
...@@ -241,7 +247,7 @@ if ($null -ne $env:HIP_PATH) { ...@@ -241,7 +247,7 @@ if ($null -ne $env:HIP_PATH) {
} }
init_vars init_vars
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
$script:cmakeDefs += @( $script:cmakeDefs += @(
"-G", "Ninja", "-G", "Ninja",
"-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_C_COMPILER=clang.exe",
...@@ -264,13 +270,13 @@ if ($null -ne $env:HIP_PATH) { ...@@ -264,13 +270,13 @@ if ($null -ne $env:HIP_PATH) {
build build
# Ninja doesn't prefix with config name # Ninja doesn't prefix with config name
${script:config}="" ${script:config}=""
install
if ($null -ne $script:DUMPBIN) { if ($null -ne $script:DUMPBIN) {
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
} }
sign sign
compress_libs compress
} }
cleanup cleanup
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})" write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
package generate package generate
//go:generate sh ./gen_darwin.sh //go:generate bash ./gen_darwin.sh
package llm
import (
_ "embed"
"fmt"
"time"
"github.com/ollama/ollama/api"
)
// jsonGrammar is a BNF-style grammar, kept as a raw string, whose root rule is
// a JSON object. It defines rules for value, object, array, string, number and
// the optional trailing whitespace rule ws.
// NOTE(review): presumably handed to the llama.cpp runner to constrain output
// to JSON when a JSON format is requested — confirm against the call sites.
const jsonGrammar = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`
// ImageData pairs the raw bytes of one image with a numeric identifier.
// It is serialized to JSON under the keys "data" and "id".
type ImageData struct {
	// Data holds the image bytes; encoding/json writes a []byte as a base64 string.
	Data []byte `json:"data"`
	// ID is the integer identifier carried alongside the image.
	ID int `json:"id"`
}
// payloadMissing is the error value stating that the dynamic library payloads
// expected by this package are not included in the current build.
var payloadMissing = fmt.Errorf(
	"expected dynamic library payloads not included in this build of ollama",
)
// prediction is the JSON shape of one completion result: the text, the model
// and prompt it refers to, a stop flag, and a nested block of timing counters.
// NOTE(review): presumably decoded from the llama.cpp server's completion
// response — confirm against the code that unmarshals it.
type prediction struct {
	// Content is the text carried by this result.
	Content string `json:"content"`
	// Model is the value of the "model" key.
	Model string `json:"model"`
	// Prompt is the value of the "prompt" key.
	Prompt string `json:"prompt"`
	// Stop is the value of the "stop" key.
	Stop bool `json:"stop"`

	// Timings has no JSON tag: it is encoded as "Timings", and decoding
	// matches the key case-insensitively.
	Timings struct {
		// PredictedN is the "predicted_n" counter.
		PredictedN int `json:"predicted_n"`
		// PredictedMS is the "predicted_ms" value (a float; the name suggests milliseconds).
		PredictedMS float64 `json:"predicted_ms"`
		// PromptN is the "prompt_n" counter.
		PromptN int `json:"prompt_n"`
		// PromptMS is the "prompt_ms" value (a float; the name suggests milliseconds).
		PromptMS float64 `json:"prompt_ms"`
	}
}
// maxRetries is an untyped constant equal to 3.
// NOTE(review): presumably the cap on retry attempts for a request to the
// runner — its use is not visible here; confirm at the call sites.
const maxRetries = 3
// PredictOpts bundles the inputs of one prediction call: the prompt text, a
// format string, any attached images, and the api.Options to apply.
type PredictOpts struct {
	// Prompt is the prompt text.
	Prompt string
	// Format is a format selector string; its accepted values are not defined here.
	Format string
	// Images lists the images supplied with the prompt.
	Images []ImageData
	// Options carries the api.Options for the call.
	Options api.Options
}
// PredictResult is one unit of prediction output: a piece of content, a done
// flag, and count/duration pairs for prompt evaluation and for evaluation.
type PredictResult struct {
	// Content is the text carried by this result.
	Content string
	// Done is the completion flag of this result.
	Done bool
	// PromptEvalCount is the prompt-evaluation counter.
	PromptEvalCount int
	// PromptEvalDuration is the prompt-evaluation time.
	PromptEvalDuration time.Duration
	// EvalCount is the evaluation counter.
	EvalCount int
	// EvalDuration is the evaluation time.
	EvalDuration time.Duration
}
// TokenizeRequest is a JSON payload with a single "content" string field,
// presumably the text to be tokenized.
type TokenizeRequest struct {
	// Content is the value of the "content" key.
	Content string `json:"content"`
}
// TokenizeResponse is a JSON payload with a single "tokens" field holding a
// list of integers.
type TokenizeResponse struct {
	// Tokens is the value of the "tokens" key.
	Tokens []int `json:"tokens"`
}
// DetokenizeRequest is a JSON payload with a single "tokens" field holding a
// list of integers, presumably the tokens to turn back into text.
type DetokenizeRequest struct {
	// Tokens is the value of the "tokens" key.
	Tokens []int `json:"tokens"`
}
// DetokenizeResponse is a JSON payload with a single "content" string field.
type DetokenizeResponse struct {
	// Content is the value of the "content" key.
	Content string `json:"content"`
}
// EmbeddingRequest is a JSON payload with a single "content" string field,
// presumably the text to embed.
type EmbeddingRequest struct {
	// Content is the value of the "content" key.
	Content string `json:"content"`
}
// EmbeddingResponse is a JSON payload with a single "embedding" field holding
// a list of 64-bit floats.
type EmbeddingResponse struct {
	// Embedding is the value of the "embedding" key.
	Embedding []float64 `json:"embedding"`
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment