Commit 35934b2e authored by Daniel Hiltgen

Adapted ROCm support to the cgo-based llama.cpp

parent f8ef4439
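In short, this change makes runner selection a runtime decision: llm.New now consults gpu.GetGPUInfo() and, when a ROCm driver is reported, dispatches to a rocm_server shim library that is embedded in the binary, extracted to the work dir, and loaded through dlopen/LoadLibrary behind cgo wrappers; otherwise it falls back to the built-in CUDA/CPU server. Callers are expected to call llm.Init(workDir) once beforehand so the shim can be extracted. A minimal caller sketch based on the signatures in this commit (the model path and error handling are illustrative only):

package main

import (
	"context"
	"log"
	"os"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/llm"
)

func main() {
	// Work dir where native shim libraries (e.g. librocm_server.so) get extracted.
	workDir, err := os.MkdirTemp("", "ollama")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(workDir)

	// Give native cgo implementations a chance to initialize (extracts the ROCm shim when it is packed in).
	if err := llm.Init(workDir); err != nil {
		log.Fatal(err)
	}

	// New picks the ROCm shim when gpu.GetGPUInfo() reports a ROCM driver, otherwise the built-in runner.
	runner, err := llm.New(workDir, "/path/to/model.gguf", nil, nil, api.DefaultOptions())
	if err != nil {
		log.Fatal(err)
	}
	defer runner.Close()

	tokens, err := runner.Encode(context.Background(), "hello")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("tokenized into %d tokens", len(tokens))
}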
@@ -48,4 +48,8 @@ init_vars
git_module_setup
apply_patches
build
install
\ No newline at end of file
install
# TODO - implement ROCm support on windows
md gguf/build/winrocm/lib -ea 0
echo $null >> gguf/build/winrocm/lib/.generated
package llm
//go:generate sh ./gen_linux.sh
//go:generate bash ./gen_linux.sh
//go:build cuda
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate rm -rf ggml/build/cuda
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf gguf/build/cuda
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
//go:build rocm
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate rm -rf ggml/build/rocm
//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/rocm --target server --config Release
//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner
//go:generate rm -rf gguf/build/rocm
//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'
//go:generate cmake --build gguf/build/rocm --target server --config Release
//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner
@@ -3,6 +3,7 @@ package llm
import (
"bytes"
"context"
_ "embed"
"errors"
"fmt"
"os"
@@ -112,12 +113,6 @@ type ImageData struct {
ID int `json:"id"`
}
type llama struct {
api.Options
ImageData []ImageData
Running
}
var (
errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
@@ -166,7 +161,8 @@ type prediction struct {
}
const maxBufferSize = 512 * format.KiloByte
const maxRetries = 6
const maxRetries = 3
const retryDelay = 1 * time.Second
type PredictOpts struct {
Prompt string
......
@@ -11,6 +11,7 @@ import (
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/format"
"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
@@ -19,7 +20,6 @@ type LLM interface {
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
Ping(context.Context) error
}
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
@@ -78,5 +78,17 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
gpuInfo := gpu.GetGPUInfo()
switch gpuInfo.Driver {
case "ROCM":
return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
default:
// Rely on the built-in CUDA based server which will fall back to CPU
return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
}
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
}
#include "rocm_shim.h"
#include <stdio.h>
#include <string.h>
#ifndef _WIN32
#include <dlfcn.h>
#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
#define LOAD_ERR() dlerror()
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#else
#include <windows.h>
#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
// TODO - refactor this with proper error message handling on windows
inline static char *LOAD_ERR() {
static char errbuf[16];
snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
return errbuf;
}
#endif
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
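// Table of server entry points to resolve from the library; the empty-name/NULL entry terminates the lookup loop below.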
struct lookup {
char *s;
void **p;
} l[] = {
{"llama_server_init", (void *)&s->llama_server_init},
{"llama_server_start", (void *)&s->llama_server_start},
{"llama_server_stop", (void *)&s->llama_server_stop},
{"llama_server_completion", (void *)&s->llama_server_completion},
{"llama_server_completion_next_result",
(void *)&s->llama_server_completion_next_result},
{"llama_server_completion_cancel",
(void *)&s->llama_server_completion_cancel},
{"llama_server_release_task_result",
(void *)&s->llama_server_release_task_result},
{"llama_server_tokenize", (void *)&s->llama_server_tokenize},
{"llama_server_detokenize", (void *)&s->llama_server_detokenize},
{"llama_server_embedding", (void *)&s->llama_server_embedding},
{"llama_server_release_json_resp",
(void *)&s->llama_server_release_json_resp},
{"", NULL},
};
printf("Lazy loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY);
if (!s->handle) {
err->id = -1;
snprintf(
err->msg, err->msg_len,
"Unable to load rocm server library: %s (If you have a Radeon card, "
"did you install the ROCM libraries?)",
LOAD_ERR());
return;
}
for (i = 0; l[i].p != NULL; i++) {
*l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
if (!*l[i].p) {
UNLOAD_LIBRARY(s->handle);
err->id = -1;
snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
l[i].s, LOAD_ERR());
return;
}
}
}
inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
s.llama_server_start();
}
inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
s.llama_server_stop();
}
inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}
#include <stdlib.h>
#include "server.h"
#ifdef __cplusplus
extern "C" {
#endif
struct rocm_llama_server {
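// handle returned by LOAD_LIBRARY (dlopen / LoadLibrary) for the ROCm server library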
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
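// (cgo can call named C functions but not C function pointers, so each wrapper takes the struct by value and forwards to the matching pointer)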
void rocm_shim_llama_server_init(struct rocm_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void rocm_shim_llama_server_start(struct rocm_llama_server s);
void rocm_shim_llama_server_stop(struct rocm_llama_server s);
void rocm_shim_llama_server_completion(struct rocm_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void rocm_shim_llama_server_completion_next_result(
struct rocm_llama_server s, const int task_id,
ext_server_task_result_t *result);
void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
const int task_id,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_task_result(
struct rocm_llama_server s, ext_server_task_result_t *result);
void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
\ No newline at end of file
package llm
import (
"fmt"
"github.com/jmorganca/ollama/api"
)
// no-op stubs for mac
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
// should never happen...
return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
}
func nativeInit(workDir string) error {
return nil
}
//go:build !darwin
package llm
/*
#include <stdlib.h>
#include "rocm_shim.h"
*/
import "C"
import (
"context"
"embed"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"runtime"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
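// When the build packs a ROCm server library under llama.cpp/gguf/build/*/lib/, it is embedded here
// and extracted to the work dir at runtime by extractLib (called from nativeInit).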
//go:embed llama.cpp/gguf/build/*/lib/*
var libEmbed embed.FS
var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
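// NoShim is flipped to false by extractLib once the embedded shim library has been written out.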
var NoShim = true
type shimExtServer struct {
s C.struct_rocm_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var shimMutex sync.Mutex
var llm *shimExtServer
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_init(llm.s, sparams, err)
}
func (llm *shimExtServer) llama_server_start() {
C.rocm_shim_llama_server_start(llm.s)
}
func (llm *shimExtServer) llama_server_stop() {
C.rocm_shim_llama_server_stop(llm.s)
}
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
}
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.rocm_shim_llama_server_release_task_result(llm.s, result)
}
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
}
func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
if NoShim {
return nil, RocmShimMissing
}
log.Printf("Loading ROCM llm server")
if llm == nil {
return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
}
llm.options = opts
return newExtServer(llm, model, adapters, projectors, numLayers, opts)
}
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.options, ctx, pred, fn)
}
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *shimExtServer) Close() {
close(llm)
}
func nativeInit(workdir string) error {
err := extractLib(workdir)
if err != nil {
if err == RocmShimMissing {
log.Printf("%s", err)
return nil
}
return err
}
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
return err
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
shimMutex.Lock()
defer shimMutex.Unlock()
if llm != nil {
return nil
}
var libName string
switch runtime.GOOS {
case "darwin":
// shouldn't happen
return nil
case "linux":
libName = "librocm_server.so"
case "windows":
libName = "rocm_server.dll"
default:
// shouldn't happen
return nil
}
libPath := C.CString(filepath.Join(workdir, libName))
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_rocm_llama_server
C.rocm_shim_init(libPath, &srv, &resp)
if resp.id < 0 {
// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
// and run against CPU
return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
}
llm = &shimExtServer{
s: srv,
options: api.DefaultOptions(),
}
return nil
}
func extractLib(workDir string) error {
files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*")
if err != nil || len(files) == 0 {
// this is expected, ollama may be compiled without shim library packed in
return RocmShimMissing
}
if len(files) != 1 {
// Shouldn't happen, but just use the first one we find
log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0])
}
srcFile, err := libEmbed.Open(files[0])
if err != nil {
return fmt.Errorf("read ROCm shim %s: %v", files[0], err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(files[0]))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write ROCm shim %s: %v", files[0], err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy ROCm shim %s: %v", files[0], err)
}
case err != nil:
return fmt.Errorf("stat ROCm shim %s: %v", files[0], err)
}
NoShim = false
return nil
}
@@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
mkdir -p dist
for TARGETARCH in amd64 arm64; do
docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker rm builder-$TARGETARCH
......
#!/usr/bin/env python3
import subprocess
import sys
from urllib.parse import urlparse
from git import Repo
# Helper script for building on a remote repo by pushing local changes with git
# (particularly helpful for targeting a remote Windows build system)
#
# Typical windows remote git config looks like this:
#
#[remote "windows-pa"]
# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama
# fetch = +refs/heads/*:refs/remotes/windows-pa/*
# uploadpack = powershell git upload-pack
# receivepack = powershell git receive-pack
#
# TODO - add argparse and make this more configurable
# - force flag becomes optional
# - generate, build or test ...
# Note: remote repo will need this run once:
# git config --local receive.denyCurrentBranch updateInstead
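# Usage (from the repo root): python3 <this script> <remote-name>
# e.g. the 'windows-pa' remote configured above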
repo = Repo(".")
# On linux, add links in /usr/local/bin to the go binaries to avoid needing this
# GoCmd = "/usr/local/go/bin/go"
GoCmd = "go"
if repo.is_dirty():
print("Tree is dirty. Commit your changes before running this script")
sys.exit(1)
if len(sys.argv) != 2:
print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes]))
sys.exit(1)
remote_name = sys.argv[1]
remote = {r.name: r for r in repo.remotes}[remote_name]
raw_url = list(remote.urls)[0]
url = urlparse(raw_url)
# Windows URLs don't quite parse properly
if url.scheme == "" and url.netloc == "":
url = urlparse("ssh://" + raw_url)
print("URL: " + str(url))
netloc = url.netloc.split(":")[0]
path = url.path
branch_name = repo.active_branch.name
print("Force pushing content to remote...")
# Use with care given the force push
remote.push(force=True).raise_if_error()
print("Ensuring correct branch checked out on remote via ssh...")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name])
# TODO - add some hardening to try to figure out how to set up the path properly
# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env'])
# TODO - or consider paramiko maybe
print("Performing generate")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...'])
print("Building")
subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.'])
@@ -32,4 +32,4 @@ for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_M
curl -L -C - --header "${ACCEPT_HEADER}" \
-o ${OLLAMA_MODELS}/blobs/${LAYER} \
${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER}
done
\ No newline at end of file
done
@@ -2,14 +2,17 @@ package server
import (
"context"
"os"
"strings"
"sync"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llm"
)
// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
@@ -33,12 +36,16 @@ var (
}
resp = [2]string{
"once upon a time",
"fourth thursday",
"united states thanksgiving",
}
)
func TestIntegrationSimpleOrcaMini(t *testing.T) {
SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel()
opts := api.DefaultOptions()
@@ -56,7 +63,13 @@ func TestIntegrationSimpleOrcaMini(t *testing.T) {
// get true concurrency working with n_parallel support in the backend
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
SkipIFNoTestData(t)
t.Skip("concurrent prediction on single runner not currently supported")
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel()
opts := api.DefaultOptions()
@@ -79,6 +92,10 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
SkipIFNoTestData(t)
workDir, err := os.MkdirTemp("", "ollama")
require.NoError(t, err)
defer os.RemoveAll(workDir)
require.NoError(t, llm.Init(workDir))
ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
defer cancel()
opts := api.DefaultOptions()
@@ -87,6 +104,7 @@ func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
var wg sync.WaitGroup
wg.Add(len(req))
t.Logf("Running %d concurrently", len(req))
for i := 0; i < len(req); i++ {
go func(i int) {
defer wg.Done()
......
@@ -25,6 +25,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
"github.com/jmorganca/ollama/llm"
"github.com/jmorganca/ollama/parser"
"github.com/jmorganca/ollama/version"
@@ -81,20 +82,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
return nil, err
}
ctx := c.Request.Context()
// check if the loaded model is still running in a subprocess, in case something unexpected happened
if loaded.runner != nil {
if err := loaded.runner.Ping(ctx); err != nil {
log.Print("loaded llm process not responding, closing now")
// the subprocess is no longer running, so close it
loaded.runner.Close()
loaded.runner = nil
loaded.Model = nil
loaded.Options = nil
}
}
needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -905,9 +892,12 @@ func Serve(ln net.Listener) error {
os.Exit(0)
}()
if runtime.GOOS == "linux" {
if err := llm.Init(s.WorkDir); err != nil {
return fmt.Errorf("unable to initialize llm library %w", err)
}
if runtime.GOOS == "linux" { // TODO - windows too
// check compatibility to log warnings
if _, err := llm.CheckVRAM(); err != nil {
if _, err := gpu.CheckVRAM(); err != nil {
log.Print(err.Error())
}
}
......