Unverified Commit c863c6a9 authored by Daniel Hiltgen, committed by GitHub

Merge pull request #3218 from dhiltgen/subprocess

Switch back to subprocessing for llama.cpp
parents 3b6a9154 1f11b525
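The commit title is terse, so here is a minimal, hedged Go sketch of the subprocess pattern it refers to: instead of linking llama.cpp in-process, the generated server binary is spawned as a child process and stopped via signal. The binary name, port, and timeouts below are illustrative assumptions, not the code in this commit.

```go
// Sketch only: spawn a llama.cpp server binary as a subprocess and manage
// its lifecycle from Go. Names and flags are assumptions for illustration.
package main

import (
	"context"
	"log/slog"
	"os"
	"os/exec"
	"time"
)

func runLlamaServer(ctx context.Context, bin string, args ...string) error {
	cmd := exec.CommandContext(ctx, bin, args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	// Ask for a graceful stop when ctx is canceled instead of the default Kill.
	cmd.Cancel = func() error {
		if cmd.Process != nil {
			return cmd.Process.Signal(os.Interrupt)
		}
		return nil
	}
	cmd.WaitDelay = 5 * time.Second // hard-kill if the signal is ignored
	if err := cmd.Start(); err != nil {
		return err
	}
	slog.Info("llama server started", "pid", cmd.Process.Pid)
	return cmd.Wait()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Hypothetical binary name; the real build emits per-variant servers.
	_ = runLlamaServer(ctx, "./ollama_llama_server", "--port", "11434")
}
```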
@@ -56,10 +56,12 @@ jobs:
 - run: go get ./...
 - run: |
 $gopath=(get-command go).source | split-path -parent
+$gccpath=(get-command gcc).source | split-path -parent
 & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
 cd $env:GITHUB_WORKSPACE
 $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-$env:PATH="$gopath;$env:PATH"
+$env:PATH="$gopath;$gccpath;$env:PATH"
+echo $env:PATH
 go generate -x ./...
 if: ${{ startsWith(matrix.os, 'windows-') }}
 name: "Windows Go Generate"
@@ -69,7 +71,9 @@ jobs:
 - uses: actions/upload-artifact@v4
 with:
 name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-path: llm/llama.cpp/build/**/lib/*
+path: |
+llm/build/**/bin/*
+llm/build/**/*.a
 generate-cuda:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -100,7 +104,7 @@ jobs:
 - uses: actions/upload-artifact@v4
 with:
 name: cuda-${{ matrix.cuda-version }}-libraries
-path: llm/llama.cpp/build/**/lib/*
+path: llm/build/**/bin/*
 generate-rocm:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -131,7 +135,7 @@ jobs:
 - uses: actions/upload-artifact@v4
 with:
 name: rocm-${{ matrix.rocm-version }}-libraries
-path: llm/llama.cpp/build/**/lib/*
+path: llm/build/**/lib/*
 # ROCm generation step
 generate-windows-rocm:
@@ -244,17 +248,17 @@ jobs:
 esac >>$GITHUB_ENV
 shell: bash
 - run: |
-mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
-touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
+mkdir -p llm/build/linux/$ARCH/stub/bin/
+touch llm/build/linux/$ARCH/stub/bin/stub.so
 if: ${{ startsWith(matrix.os, 'ubuntu-') }}
 - run: |
-mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
-touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
-touch llm/llama.cpp/ggml-metal.metal
+mkdir -p llm/build/darwin/$ARCH/stub/bin/
+touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
+touch llm/ggml-metal.metal
 if: ${{ startsWith(matrix.os, 'macos-') }}
 - run: |
-mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
-touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
+mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
+touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
 if: ${{ startsWith(matrix.os, 'windows-') }}
 - uses: golangci/golangci-lint-action@v3
 test:
@@ -271,6 +275,7 @@ jobs:
 env:
 GOARCH: ${{ matrix.arch }}
 CGO_ENABLED: '1'
+OLLAMA_CPU_TARGET: "static"
 steps:
 - uses: actions/checkout@v4
 with:
@@ -287,18 +292,19 @@ jobs:
 esac >>$GITHUB_ENV
 shell: bash
 - run: |
-mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
-touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
+mkdir -p llm/build/linux/$ARCH/stub/bin/
+touch llm//build/linux/$ARCH/stub/bin/stub.so
 if: ${{ startsWith(matrix.os, 'ubuntu-') }}
 - run: |
-mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
-touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
-touch llm/llama.cpp/ggml-metal.metal
+mkdir -p llm/build/darwin/$ARCH/stub/bin/
+touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
+touch llm/ggml-metal.metal
 if: ${{ startsWith(matrix.os, 'macos-') }}
 - run: |
-mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
-touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
+mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
+touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
 if: ${{ startsWith(matrix.os, 'windows-') }}
+- run: go generate ./...
 - run: go build
 - run: go test -v ./...
 - uses: actions/upload-artifact@v4
......
@@ -11,3 +11,4 @@ ggml-metal.metal
 .idea
 test_data
 *.crt
+llm/build
\ No newline at end of file
@@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
@@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
-FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
+FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
@@ -101,8 +108,8 @@ ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build -trimpath .
......
@@ -9,6 +9,7 @@ import (
 "os"
 "os/exec"
 "path/filepath"
+"syscall"
 "time"
 "github.com/ollama/ollama/api"
@@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 io.Copy(logFile, stderr) //nolint:errcheck
 }()
+// Re-wire context done behavior to attempt a graceful shutdown of the server
+cmd.Cancel = func() error {
+if cmd.Process != nil {
+cmd.Process.Signal(os.Interrupt) //nolint:errcheck
+tick := time.NewTicker(10 * time.Millisecond)
+defer tick.Stop()
+for {
+select {
+case <-tick.C:
+// OS agnostic "is it still running"
+if proc, err := os.FindProcess(int(cmd.Process.Pid)); err != nil || errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+return nil //nolint:nilerr
+}
+case <-time.After(5 * time.Second):
+slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
+cmd.Process.Kill() //nolint:errcheck
+}
+}
+}
+return nil
+}
 // run the command and wait for it to finish
 if err := cmd.Start(); err != nil {
 return done, fmt.Errorf("failed to start server %w", err)
@@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 select {
 case <-ctx.Done():
-slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
+slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
 done <- code
 return
 default:
......
@@ -76,3 +76,10 @@ install script which version to install.
 ```sh
 curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh
 ```
+
+## Linux tmp noexec
+
+If your system is configured with the "noexec" flag where Ollama stores its
+temporary executable files, you can specify an alternate location by setting
+OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example
+OLLAMA_TMPDIR=/usr/share/ollama/
\ No newline at end of file
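The FAQ entry above describes the behavior the PR wires into the payload extraction code. A simplified Go sketch of that lookup is shown below; the function name and error text are illustrative, not Ollama's API.

```go
// Simplified sketch: honor OLLAMA_TMPDIR when set, otherwise fall back to a
// system temp dir, mirroring the behavior described in the FAQ entry above.
package main

import (
	"fmt"
	"os"
)

func resolveTmpDir() (string, error) {
	if dir := os.Getenv("OLLAMA_TMPDIR"); dir != "" {
		// Must be writable (and executable) by the user ollama runs as.
		if err := os.MkdirAll(dir, 0o755); err != nil {
			return "", fmt.Errorf("failed to create %s: %w", dir, err)
		}
		return dir, nil
	}
	return os.MkdirTemp("", "ollama")
}

func main() {
	dir, err := resolveTmpDir()
	if err != nil {
		panic(err)
	}
	fmt.Println("payloads will be extracted under", dir)
}
```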
@@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 return
 }
+updateLibPath(libDir)
 gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 if gfxOverride == "" {
 supported, err := GetSupportedGFX(libDir)
@@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 }
 }
+func updateLibPath(libDir string) {
+ldPaths := []string{}
+if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
+ldPaths = strings.Split(val, ":")
+}
+for _, d := range ldPaths {
+if d == libDir {
+return
+}
+}
+val := strings.Join(append(ldPaths, libDir), ":")
+slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
+os.Setenv("LD_LIBRARY_PATH", val)
+}
 // Walk the sysfs nodes for the available GPUs and gather information from them
 // skipping over any devices in the skip map
 func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
......
@@ -11,6 +11,7 @@ import (
 "strings"
 "sync"
 "syscall"
+"time"
 )
 var (
@@ -21,12 +22,21 @@ var (
 func PayloadsDir() (string, error) {
 lock.Lock()
 defer lock.Unlock()
+var err error
 if payloadsDir == "" {
 cleanupTmpDirs()
-tmpDir, err := os.MkdirTemp("", "ollama")
+tmpDir := os.Getenv("OLLAMA_TMPDIR")
+if tmpDir == "" {
+tmpDir, err = os.MkdirTemp("", "ollama")
 if err != nil {
 return "", fmt.Errorf("failed to generate tmp dir: %w", err)
 }
+} else {
+err = os.MkdirAll(tmpDir, 0755)
+if err != nil {
+return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
+}
+}
 // Track our pid so we can clean up orphaned tmpdirs
 pidFilePath := filepath.Join(tmpDir, "ollama.pid")
@@ -83,10 +93,15 @@ func Cleanup() {
 tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
 slog.Debug("cleaning up", "dir", tmpDir)
 err := os.RemoveAll(tmpDir)
+if err != nil {
+// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
+time.Sleep(1000 * time.Millisecond)
+err = os.RemoveAll(tmpDir)
 if err != nil {
 slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
 }
 }
+}
 }
 func UpdatePath(dir string) {
......
@@ -30,12 +30,11 @@ type handles struct {
 }
 const (
-cudaMinimumMemory = 377 * format.MebiByte
-rocmMinimumMemory = 377 * format.MebiByte
+cudaMinimumMemory = 457 * format.MebiByte
+rocmMinimumMemory = 457 * format.MebiByte
 )
 var gpuMutex sync.Mutex
-var gpuHandles *handles = nil
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -85,11 +84,11 @@ var CudartWindowsGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 // Note: gpuMutex must already be held
-func initGPUHandles() {
+func initGPUHandles() *handles {
 // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
-gpuHandles = &handles{nil, nil}
+gpuHandles := &handles{nil, nil}
 var nvmlMgmtName string
 var nvmlMgmtPatterns []string
 var cudartMgmtName string
@@ -116,7 +115,7 @@ func initGPUHandles() {
 }
 cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 default:
-return
+return gpuHandles
 }
 slog.Info("Detecting GPU type")
@@ -126,7 +125,7 @@ func initGPUHandles() {
 if cudart != nil {
 slog.Info("Nvidia GPU detected via cudart")
 gpuHandles.cudart = cudart
-return
+return gpuHandles
 }
 }
@@ -137,10 +136,10 @@ func initGPUHandles() {
 if nvml != nil {
 slog.Info("Nvidia GPU detected via nvidia-ml")
 gpuHandles.nvml = nvml
-return
+return gpuHandles
 }
 }
+return gpuHandles
 }
 func GetGPUInfo() GpuInfo {
@@ -148,9 +147,16 @@ func GetGPUInfo() GpuInfo {
 // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 gpuMutex.Lock()
 defer gpuMutex.Unlock()
-if gpuHandles == nil {
-initGPUHandles()
-}
+gpuHandles := initGPUHandles()
+defer func() {
+if gpuHandles.nvml != nil {
+C.nvml_release(*gpuHandles.nvml)
+}
+if gpuHandles.cudart != nil {
+C.cudart_release(*gpuHandles.cudart)
+}
+}()
 // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 cpuVariant := GetCPUVariant()
......
@@ -62,6 +62,10 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
 LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
 UNLOAD_LIBRARY(resp->ch.handle);
 resp->ch.handle = NULL;
+if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+resp->err = strdup("your nvidia driver is too old or missing, please upgrade to run ollama");
+return;
+}
 snprintf(buf, buflen, "cudart init failure: %d", ret);
 resp->err = strdup(buf);
 return;
@@ -187,4 +191,10 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r
 }
 }
+void cudart_release(cudart_handle_t h) {
+LOG(h.verbose, "releasing cudart library\n");
+UNLOAD_LIBRARY(h.handle);
+h.handle = NULL;
+}
 #endif // __APPLE__
\ No newline at end of file
@@ -7,6 +7,7 @@
 typedef enum cudartReturn_enum {
 CUDART_SUCCESS = 0,
 CUDART_UNSUPPORTED = 1,
+CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
 // Other values omitted for now...
 } cudartReturn_t;
@@ -54,6 +55,7 @@ typedef struct cudart_compute_capability {
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
 void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
+void cudart_release(cudart_handle_t ch);
 #endif // __GPU_INFO_CUDART_H__
 #endif // __APPLE__
@@ -211,4 +211,11 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
 }
 }
 }
+void nvml_release(nvml_handle_t h) {
+LOG(h.verbose, "releasing nvml library\n");
+UNLOAD_LIBRARY(h.handle);
+h.handle = NULL;
+}
 #endif // __APPLE__
\ No newline at end of file
@@ -51,6 +51,7 @@ typedef struct nvml_compute_capability {
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
 void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
 void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
+void nvml_release(nvml_handle_t ch);
 #endif // __GPU_INFO_NVML_H__
 #endif // __APPLE__
\ No newline at end of file
@@ -24,5 +24,5 @@ func TestOrcaMiniBlueSky(t *testing.T) {
 "seed": 123,
 },
 }
-GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh"})
+GenerateTestHelper(ctx, t, &http.Client{}, req, []string{"rayleigh", "scattering"})
 }
@@ -126,7 +126,7 @@ func StartServer(ctx context.Context, ollamaHost string) error {
 }
 func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoint, modelName string) error {
-slog.Debug("checking status of model", "model", modelName)
+slog.Info("checking status of model", "model", modelName)
 showReq := &api.ShowRequest{Name: modelName}
 requestJSON, err := json.Marshal(showReq)
 if err != nil {
@@ -174,14 +174,21 @@ func PullIfMissing(ctx context.Context, client *http.Client, scheme, testEndpoin
 return nil
 }
+var serverProcMutex sync.Mutex
 func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client, genReq api.GenerateRequest, anyResp []string) {
+// TODO maybe stuff in an init routine?
+lifecycle.InitLogging()
 requestJSON, err := json.Marshal(genReq)
 if err != nil {
 t.Fatalf("Error serializing request: %v", err)
 }
 defer func() {
-if t.Failed() && os.Getenv("OLLAMA_TEST_EXISTING") == "" {
+if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
+// TODO defer serverProcMutex.Unlock()
+if t.Failed() {
 fp, err := os.Open(lifecycle.ServerLogFile)
 if err != nil {
 slog.Error("failed to open server log", "logfile", lifecycle.ServerLogFile, "error", err)
@@ -200,10 +207,18 @@ func GenerateTestHelper(ctx context.Context, t *testing.T, client *http.Client,
 if err != nil && !os.IsNotExist(err) {
 slog.Warn("failed to cleanup", "logfile", lifecycle.ServerLogFile, "error", err)
 }
+}
 }()
 scheme, testEndpoint := GetTestEndpoint()
 if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
+serverProcMutex.Lock()
+fp, err := os.CreateTemp("", "ollama-server-*.log")
+if err != nil {
+t.Fatalf("failed to generate log file: %s", err)
+}
+lifecycle.ServerLogFile = fp.Name()
+fp.Close()
 assert.NoError(t, StartServer(ctx, testEndpoint))
 }
......
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
#ifdef __linux__
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#elif _WIN32
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
#define LOAD_ERR() ({\
LPSTR messageBuffer = NULL; \
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
char *resp = strdup(messageBuffer); \
LocalFree(messageBuffer); \
resp; \
})
#else
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() strdup(dlerror())
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("loading library %s\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
if (!s->handle) {
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len,
"Unable to load dynamic server library: %s", msg);
free(msg);
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
char *msg = LOAD_ERR();
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, msg);
free(msg);
return;
}
}
}
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"path/filepath"
"strings"
"sync"
"time"
"unsafe"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/gpu"
)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options *api.Options
}
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
var resp C.ext_server_resp_t
resp.msg_len = len
bytes := make([]byte, len)
resp.msg = (*C.char)(C.CBytes(bytes))
return resp
}
func freeExtServerResp(resp C.ext_server_resp_t) {
if resp.msg_len == 0 {
return
}
C.free(unsafe.Pointer(resp.msg))
}
func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
gpu.UpdatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(512)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm := dynExtServer{
s: srv,
options: opts,
}
slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
defer C.free(unsafe.Pointer(sparams.model))
sparams.embedding = true
sparams.n_ctx = C.uint(opts.NumCtx)
sparams.n_batch = C.uint(opts.NumBatch)
sparams.n_gpu_layers = C.int(opts.NumGPU)
sparams.main_gpu = C.int(opts.MainGPU)
sparams.n_parallel = 1 // TODO - wire up concurrency
// Always use the value encoded in the model
sparams.rope_freq_base = 0.0
sparams.rope_freq_scale = 0.0
sparams.memory_f16 = C.bool(opts.F16KV)
sparams.use_mlock = C.bool(opts.UseMLock)
sparams.use_mmap = C.bool(opts.UseMMap)
if opts.UseNUMA {
sparams.numa = C.int(1)
} else {
sparams.numa = C.int(0)
}
sparams.lora_adapters = nil
for i := 0; i < len(adapters); i++ {
la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
defer C.free(unsafe.Pointer(la))
la.adapter = C.CString(adapters[i])
defer C.free(unsafe.Pointer(la.adapter))
la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
la.next = nil
if i == 0 {
sparams.lora_adapters = la
} else {
tmp := sparams.lora_adapters
for ; tmp.next != nil; tmp = tmp.next {
}
tmp.next = la
}
}
if len(projectors) > 0 {
// TODO: applying multiple projectors is not supported by the llama.cpp server yet
sparams.mmproj = C.CString(projectors[0])
defer C.free(unsafe.Pointer(sparams.mmproj))
} else {
sparams.mmproj = nil
}
sparams.n_threads = C.uint(opts.NumThread)
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
sparams.verbose_logging = C.bool(true)
} else {
sparams.verbose_logging = C.bool(false)
}
slog.Info("Initializing llama server")
slog.Debug(fmt.Sprintf("server params: %+v", sparams))
initResp := newExtServerResp(512)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
mutex.Unlock()
err := extServerResponseToErr(initResp)
slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
return nil, err
}
slog.Info("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return &llm, nil
}
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
if len(predict.Images) > 0 {
slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
}
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
"n_predict": predict.Options.NumPredict,
"n_keep": predict.Options.NumKeep,
"temperature": predict.Options.Temperature,
"top_k": predict.Options.TopK,
"top_p": predict.Options.TopP,
"tfs_z": predict.Options.TFSZ,
"typical_p": predict.Options.TypicalP,
"repeat_last_n": predict.Options.RepeatLastN,
"repeat_penalty": predict.Options.RepeatPenalty,
"presence_penalty": predict.Options.PresencePenalty,
"frequency_penalty": predict.Options.FrequencyPenalty,
"mirostat": predict.Options.Mirostat,
"mirostat_tau": predict.Options.MirostatTau,
"mirostat_eta": predict.Options.MirostatEta,
"penalize_nl": predict.Options.PenalizeNewline,
"seed": predict.Options.Seed,
"stop": predict.Options.Stop,
"image_data": predict.Images,
"cache_prompt": true,
}
if predict.Format == "json" {
request["grammar"] = jsonGrammar
if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
}
}
retryDelay := 100 * time.Microsecond
for retries := 0; retries < maxRetries; retries++ {
if retries > 0 {
time.Sleep(retryDelay) // wait before retrying
retryDelay *= 2 // exponential backoff
}
// Handling JSON marshaling with special characters unescaped.
buffer := &bytes.Buffer{}
enc := json.NewEncoder(buffer)
enc.SetEscapeHTML(false)
if err := enc.Encode(request); err != nil {
return fmt.Errorf("failed to marshal data: %w", err)
}
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
retryNeeded := false
// keep track of the last token generated, this is used to abort if the model starts looping
var lastToken string
var tokenRepeat int
out:
for {
select {
case <-ctx.Done():
return cancelCompletion(llm, resp)
default:
var result C.ext_server_task_result_t
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
}
}
if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
retryNeeded = true
// task will already be canceled
break out
}
switch {
case strings.TrimSpace(p.Content) == lastToken:
tokenRepeat++
default:
lastToken = strings.TrimSpace(p.Content)
tokenRepeat = 0
}
// 30 picked as an arbitrary max token repeat limit, modify as needed
if tokenRepeat > 30 {
slog.Debug("prediction aborted, token repeat limit reached")
return cancelCompletion(llm, resp)
}
if p.Content != "" {
fn(PredictResult{
Content: p.Content,
})
}
if p.Stop || bool(result.stop) {
fn(PredictResult{
Done: true,
PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
EvalCount: p.Timings.PredictedN,
EvalDuration: parseDurationMs(p.Timings.PredictedMS),
})
return nil
}
}
}
if !retryNeeded {
return nil // success
}
}
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
return nil
}
}
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err2)
}
return encoded.Tokens, err
}
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err2)
}
return decoded.Content, err
}
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
}
req := C.CString(string(data))
defer C.free(unsafe.Pointer(req))
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
}
return embedding.Embedding, nil
}
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
#include <stdlib.h>
#include "ext_server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct dynamic_llama_server {
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
\ No newline at end of file
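The header above notes there is "no good way to call C function pointers from Go", which is why every entry point is routed through a plain C wrapper. The toy cgo sketch below illustrates that constraint; all names in it are illustrative and not part of this repository.

```go
// Toy illustration of the indirection used by dyn_ext_server.c: cgo cannot
// invoke a C function pointer directly from Go, so a small C shim performs
// the call. Names here are assumptions for illustration only.
package main

/*
typedef int (*adder_t)(int, int);

static int add_impl(int a, int b) { return a + b; }

// Pretend this was resolved at runtime with dlsym/GetProcAddress.
static adder_t resolve_adder() { return add_impl; }

// The shim Go actually calls; it dereferences the pointer on the C side.
static int call_adder(adder_t fn, int a, int b) { return fn(a, b); }
*/
import "C"

import "fmt"

func main() {
	fn := C.resolve_adder() // held in Go only as an opaque value
	// Calling fn directly from Go is not possible; route through the C shim.
	fmt.Println(int(C.call_adder(fn, 2, 3)))
}
```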
-set(TARGET ext_server)
+set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
-add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
-else()
-add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
+TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
\ No newline at end of file
-target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
-target_link_libraries(${TARGET} PRIVATE ggml llava common )
-set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
-install(TARGETS ext_server LIBRARY)
-if (CUDAToolkit_FOUND)
-target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-if (WIN32)
-target_link_libraries(${TARGET} PRIVATE nvml)
-endif()
-endif()
\ No newline at end of file
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```
\ No newline at end of file
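The README above says `go generate ./...` produces one or more copies of the server per detected GPU library, and the Go build picks among them at runtime. A hedged Go sketch of that selection idea follows; the variant names, payload directory, and helper are assumptions for illustration, not Ollama's actual implementation.

```go
// Sketch of variant selection: the build emits one payload per GPU/CPU
// capability, and at runtime the most capable available one is chosen.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// pickVariant returns the first payload that exists, from most to least capable.
func pickVariant(payloadDir string, preferred []string) (string, error) {
	for _, v := range preferred {
		p := filepath.Join(payloadDir, v)
		if _, err := os.Stat(p); err == nil {
			return p, nil
		}
	}
	return "", fmt.Errorf("no usable llama.cpp variant found in %s", payloadDir)
}

func main() {
	// Illustrative ordering: GPU builds first, then progressively plainer CPU builds.
	order := []string{"cuda_v11", "rocm_v6", "cpu_avx2", "cpu_avx", "cpu"}
	v, err := pickVariant("/tmp/ollama/payloads", order)
	if err != nil {
		panic(err)
	}
	fmt.Println("selected variant:", v)
}
```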
#include "ext_server.h"
#include <atomic>
// Necessary evil since the server types are not defined in a header
#include "server.cpp"
// Low level API access to verify GPU access
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define cudaGetDevice hipGetDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#else
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif // defined(GGML_USE_HIPBLAS)
#endif // GGML_USE_CUBLAS
// Expose the llama server as a callable extern "C" API
llama_server_context *llama = NULL;
std::thread ext_server_thread;
bool shutting_down = false;
std::atomic_int recv_counter;
// RAII wrapper for tracking in-flight recv calls
class atomicRecv {
public:
atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
++this->atomic;
}
~atomicRecv() {
--this->atomic;
}
private:
std::atomic<int> &atomic;
};
void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
recv_counter = 0;
assert(err != NULL && sparams != NULL);
log_set_target(stderr);
if (!sparams->verbose_logging) {
server_verbose = true;
log_disable();
}
LOG_TEE("system info: %s\n", llama_print_system_info());
err->id = 0;
err->msg[0] = '\0';
try {
llama = new llama_server_context;
gpt_params params;
params.n_ctx = sparams->n_ctx;
params.n_batch = sparams->n_batch;
if (sparams->n_threads > 0) {
params.n_threads = sparams->n_threads;
}
params.n_parallel = sparams->n_parallel;
params.rope_freq_base = sparams->rope_freq_base;
params.rope_freq_scale = sparams->rope_freq_scale;
if (sparams->memory_f16) {
params.cache_type_k = "f16";
params.cache_type_v = "f16";
} else {
params.cache_type_k = "f32";
params.cache_type_v = "f32";
}
params.n_gpu_layers = sparams->n_gpu_layers;
params.main_gpu = sparams->main_gpu;
params.use_mlock = sparams->use_mlock;
params.use_mmap = sparams->use_mmap;
params.numa = (ggml_numa_strategy)sparams->numa;
params.embedding = sparams->embedding;
if (sparams->model != NULL) {
params.model = sparams->model;
}
if (sparams->lora_adapters != NULL) {
for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
la = la->next) {
params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
}
params.use_mmap = false;
}
if (sparams->mmproj != NULL) {
params.mmproj = std::string(sparams->mmproj);
}
#if defined(GGML_USE_CUBLAS)
// Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
LOG_TEE("Performing pre-initialization of GPU\n");
int id;
cudaError_t cudaErr = cudaGetDevice(&id);
if (cudaErr != cudaSuccess) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
return;
}
#endif
llama_backend_init();
llama_numa_init(params.numa);
if (!llama->load_model(params)) {
// an error occurred that was not thrown
err->id = -1;
snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
return;
}
llama->initialize();
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception initializing llama server");
}
}
void llama_server_start() {
assert(llama != NULL);
// TODO mutex to protect thread creation
ext_server_thread = std::thread([&]() {
try {
LOG_TEE("llama server main loop starting\n");
ggml_time_init();
llama->queue_tasks.on_new_task(std::bind(
&llama_server_context::process_single_task, llama, std::placeholders::_1));
llama->queue_tasks.on_finish_multitask(std::bind(
&llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
llama->queue_tasks.on_run_slots(std::bind(
&llama_server_context::update_slots, llama));
llama->queue_results.on_multitask_update(std::bind(
&llama_server_queue::update_multitask,
&llama->queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
llama->queue_tasks.start_loop();
} catch (std::exception &e) {
LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
} catch (...) {
LOG_TEE("caught unknown exception in llama server main loop\n");
}
LOG_TEE("\nllama server shutting down\n");
llama_backend_free();
});
}
void llama_server_stop() {
assert(llama != NULL);
// Shutdown any in-flight requests and block incoming requests.
LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
shutting_down = true;
while (recv_counter.load() > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
// This may take a while for any pending tasks to drain
// TODO - consider a timeout to cancel tasks if it's taking too long
llama->queue_tasks.terminate();
ext_server_thread.join();
delete llama;
llama = NULL;
LOG_TEE("llama server shutdown complete\n");
shutting_down = false;
}
void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
assert(llama != NULL && json_req != NULL && resp != NULL);
resp->id = -1;
resp->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
json data = json::parse(json_req);
resp->id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(resp->id);
llama->request_completion(resp->id, data, false, false, -1);
} catch (std::exception &e) {
snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
} catch (...) {
snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
}
}
void llama_server_completion_next_result(const int task_id,
ext_server_task_result_t *resp) {
assert(llama != NULL && resp != NULL);
resp->id = -1;
resp->stop = false;
resp->error = false;
resp->json_resp = NULL;
std::string result_json;
try {
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
result_json =
result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
resp->id = result.id;
resp->stop = result.stop;
resp->error = result.error;
if (result.error) {
LOG_TEE("next result cancel on error\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (result.stop) {
LOG_TEE("next result cancel on stop\n");
llama->request_cancel(task_id);
LOG_TEE("next result removing waiting task ID: %d\n", task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} else if (shutting_down) {
LOG_TEE("aborting completion due to shutdown %d\n", task_id);
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
resp->stop = true;
}
} catch (std::exception &e) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
LOG_TEE("llama server completion exception %s\n", e.what());
} catch (...) {
resp->error = true;
resp->id = -1;
result_json = "{\"error\":\"Unknown exception during completion\"}";
LOG_TEE("llama server completion unknown exception\n");
}
const std::string::size_type size = result_json.size() + 1;
resp->json_resp = new char[size];
snprintf(resp->json_resp, size, "%s", result_json.c_str());
}
void llama_server_release_task_result(ext_server_task_result_t *result) {
if (result == NULL || result->json_resp == NULL) {
return;
}
delete[] result->json_resp;
}
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
assert(llama != NULL && err != NULL);
err->id = 0;
err->msg[0] = '\0';
try {
llama->request_cancel(task_id);
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len,
"Unknown exception completion cancel in llama server");
}
}
void llama_server_tokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = llama->tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
}
}
void llama_server_release_json_resp(char **json_resp) {
if (json_resp == NULL || *json_resp == NULL) {
return;
}
delete[] *json_resp;
}
void llama_server_detokenize(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
std::string result_json = data.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
}
}
void llama_server_embedding(const char *json_req, char **json_resp,
ext_server_resp_t *err) {
assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
*json_resp = NULL;
err->id = 0;
err->msg[0] = '\0';
try {
if (shutting_down) {
throw std::runtime_error("server shutting down");
}
const json body = json::parse(json_req);
json prompt;
if (body.count("content") != 0) {
prompt = body["content"];
} else {
prompt = "";
}
const int task_id = llama->queue_tasks.get_new_id();
llama->queue_results.add_waiting_task_id(task_id);
llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
atomicRecv ar(recv_counter);
task_result result = llama->queue_results.recv(task_id);
std::string result_json = result.result_json.dump();
const std::string::size_type size = result_json.size() + 1;
*json_resp = new char[size];
snprintf(*json_resp, size, "%s", result_json.c_str());
llama->queue_results.remove_waiting_task_id(task_id);
} catch (std::exception &e) {
err->id = -1;
snprintf(err->msg, err->msg_len, "exception %s", e.what());
} catch (...) {
err->id = -1;
snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
}
}
\ No newline at end of file