Unverified Commit 96fb441a authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

Merge pull request #1146 from dhiltgen/ext_server_cgo

Add cgo implementation for llama.cpp
parents fabf2f34 495c06e4
#include <stdlib.h>
#include "server.h"
#ifdef __cplusplus
extern "C" {
#endif
// Table of llama server entry points held as function pointers, one per
// llama_server_* call, so that the server implementation can be selected at
// runtime. It is filled in by dynamic_shim_init.
struct dynamic_llama_server {
// NOTE(review): presumably the handle of the loaded library that the function
// pointers below were resolved from - confirm against dynamic_shim_init.
void *handle;
void (*llama_server_init)(ext_server_params_t *sparams,
ext_server_resp_t *err);
void (*llama_server_start)();
void (*llama_server_stop)();
void (*llama_server_completion)(const char *json_req,
ext_server_resp_t *resp);
void (*llama_server_completion_next_result)(const int task_id,
ext_server_task_result_t *result);
void (*llama_server_completion_cancel)(const int task_id,
ext_server_resp_t *err);
void (*llama_server_release_task_result)(ext_server_task_result_t *result);
void (*llama_server_tokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_detokenize)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_embedding)(const char *json_req, char **json_resp,
ext_server_resp_t *err);
void (*llama_server_release_json_resp)(char **json_resp);
};
// Populates *s for the library at libPath; problems are reported through err.
// NOTE(review): the implementation is not in this header - confirm how failure
// is encoded in err before relying on it.
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
// Each dynamic_shim_llama_server_* function below mirrors the function pointer
// of the same name in struct dynamic_llama_server and takes that struct (by
// value) as its first argument.
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dynamic_shim_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus
}
#endif
\ No newline at end of file
package llm
/*
#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincpu/dist/lib
#cgo windows LDFLAGS: -lcpu_server -lpthread
#include <stdlib.h>
#include "server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log"
"os"
"strings"
"sync"
"time"
"unsafe"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
)
// newExtServerResp builds a response struct whose msg field points at a
// C-allocated, zero-filled buffer of len bytes and whose msg_len records that
// size. The buffer must be released with freeExtServerResp.
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
	zeroed := make([]byte, len)
	return C.ext_server_resp_t{
		msg_len: len,
		// C.CBytes copies the Go slice into memory owned by the C allocator.
		msg: (*C.char)(C.CBytes(zeroed)),
	}
}
// freeExtServerResp releases the C message buffer carried by resp. A response
// whose msg_len is zero is left untouched.
func freeExtServerResp(resp C.ext_server_resp_t) {
	if resp.msg_len != 0 {
		C.free(unsafe.Pointer(resp.msg))
	}
}
// extServerResponseToErr converts the message carried by resp into a Go error.
//
// The message is supplied as an argument to a constant "%s" format rather than
// being used as the format string itself, so a '%' inside the server's message
// is reported verbatim instead of being interpreted as a formatting verb.
func extServerResponseToErr(resp C.ext_server_resp_t) error {
	return fmt.Errorf("%s", C.GoString(resp.msg))
}
// extServer is an LLM that additionally exposes the raw llama_server_* entry
// points. The package-level helpers (newExtServer, predict, encode, decode,
// embedding, close) are written against this interface so they can drive any
// implementation of those entry points.
type extServer interface {
LLM
// Initialization and main-loop lifecycle.
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
llama_server_start()
llama_server_stop()
// Completion: submit a JSON request, then fetch results / cancel by task id.
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
llama_server_release_task_result(result *C.ext_server_task_result_t)
// JSON-in/JSON-out calls; the returned json_resp is handed back through
// llama_server_release_json_resp once it has been copied.
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_release_json_resp(json_resp **C.char)
}
// llamaExtServer is the extServer implementation whose methods call the
// C.llama_server_* functions directly. It embeds the api.Options it was
// created with, which Predict passes on to predict.
type llamaExtServer struct {
api.Options
}
// Note: current implementation does not support concurrent instantiations
// mutex is locked in newExtServer and unlocked in close.
var mutex sync.Mutex
// The methods below implement the extServer entry points by forwarding each
// call, with its arguments unchanged, to the C function of the same name.

// llama_server_init forwards to C.llama_server_init.
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
// llama_server_start forwards to C.llama_server_start.
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
// llama_server_stop forwards to C.llama_server_stop.
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
// llama_server_completion forwards to C.llama_server_completion.
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
// llama_server_completion_next_result forwards to C.llama_server_completion_next_result.
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
// llama_server_completion_cancel forwards to C.llama_server_completion_cancel.
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
// llama_server_release_task_result forwards to C.llama_server_release_task_result.
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
// llama_server_tokenize forwards to C.llama_server_tokenize.
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
// llama_server_detokenize forwards to C.llama_server_detokenize.
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
// llama_server_embedding forwards to C.llama_server_embedding.
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
// llama_server_release_json_resp forwards to C.llama_server_release_json_resp.
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
// newDefaultExtServer wraps opts in a llamaExtServer and hands it to
// newExtServer together with the remaining arguments, returning whatever
// newExtServer returns.
func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	return newExtServer(&llamaExtServer{Options: opts}, model, adapters, projectors, numLayers, opts)
}
// newExtServer initializes and starts the llama server behind server for the
// given model file and returns server on success.
//
// It takes the package-level mutex (waiting if another server currently holds
// it); the lock is normally released by close(). On every error return the
// lock is released here instead, because no server is handed back to the
// caller and close() would therefore never run.
//
// adapters are passed to the server as a linked list of LoRA adapters, each
// with scale 1.0. Only the first entry of projectors is used. Every C
// allocation made while building the parameters is freed when this function
// returns.
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	if !mutex.TryLock() {
		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	fileInfo, err := os.Stat(model)
	if err != nil {
		mutex.Unlock()
		return nil, err
	}

	var sparams C.ext_server_params_t
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))

	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)

	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(numGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 1 // TODO - wire up concurrency

	// Always use the value encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0
	sparams.memory_f16 = C.bool(opts.F16KV)
	sparams.use_mlock = C.bool(opts.UseMLock)
	sparams.use_mmap = C.bool(opts.UseMMap)
	sparams.numa = C.bool(opts.UseNUMA)

	// Build the adapter list in order, remembering the tail so each append is
	// O(1). The deferred frees intentionally run at function return: the list
	// must stay alive until llama_server_init has consumed it.
	sparams.lora_adapters = nil
	var tail *C.ext_server_lora_adapter_t
	for _, adapter := range adapters {
		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapter)
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if tail == nil {
			sparams.lora_adapters = la
		} else {
			tail.next = la
		}
		tail = la
	}

	if len(projectors) > 0 {
		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
		sparams.mmproj = C.CString(projectors[0])
		defer C.free(unsafe.Pointer(sparams.mmproj))
	} else {
		sparams.mmproj = nil
	}

	sparams.n_threads = C.uint(opts.NumThread)

	log.Printf("Initializing internal llama server")
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	server.llama_server_init(&sparams, &resp)
	if resp.id < 0 {
		mutex.Unlock()
		return nil, extServerResponseToErr(resp)
	}

	log.Printf("Starting internal llama main loop")
	server.llama_server_start()
	return server, nil
}
// Predict runs a completion by delegating to predict with this server and the
// api.Options embedded in it; fn receives the results.
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(llm, llm.Options, ctx, pred, fn)
}
// predict submits a streaming completion request built from predict and opts
// to llm and feeds the results to fn.
//
// fn is called once per result with non-empty content, and a final time with
// Done set and the timing counters when the server reports stop. If ctx is
// cancelled the task is cancelled on the server and nil is returned, unless
// the cancel itself fails. When the server answers an attempt with a
// "slot unavailable" error result, the request is resubmitted after an
// exponentially growing delay, up to maxRetries attempts in total.
//
// The request is serialized and copied into C memory once, before the first
// attempt, and that single C string is reused for every retry and freed on
// return.
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)

	var imageData []ImageData
	for cnt, i := range predict.Images {
		imageData = append(imageData, ImageData{Data: i, ID: cnt})
	}
	log.Printf("loaded %d images", len(imageData))

	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         opts.NumPredict,
		"n_keep":            opts.NumKeep,
		"temperature":       opts.Temperature,
		"top_k":             opts.TopK,
		"top_p":             opts.TopP,
		"tfs_z":             opts.TFSZ,
		"typical_p":         opts.TypicalP,
		"repeat_last_n":     opts.RepeatLastN,
		"repeat_penalty":    opts.RepeatPenalty,
		"presence_penalty":  opts.PresencePenalty,
		"frequency_penalty": opts.FrequencyPenalty,
		"mirostat":          opts.Mirostat,
		"mirostat_tau":      opts.MirostatTau,
		"mirostat_eta":      opts.MirostatEta,
		"penalize_nl":       opts.PenalizeNewline,
		"seed":              opts.Seed,
		"stop":              opts.Stop,
		"image_data":        imageData,
	}

	if predict.Format == "json" {
		request["grammar"] = jsonGrammar
	}

	// Handling JSON marshaling with special characters unescaped.
	buffer := &bytes.Buffer{}
	enc := json.NewEncoder(buffer)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(request); err != nil {
		return fmt.Errorf("failed to marshal data: %w", err)
	}

	req := C.CString(buffer.String())
	defer C.free(unsafe.Pointer(req))

	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}

		llm.llama_server_completion(req, &resp)
		if resp.id < 0 {
			return extServerResponseToErr(resp)
		}

	out:
		for {
			select {
			case <-ctx.Done():
				// This handles the request cancellation
				llm.llama_server_completion_cancel(resp.id, &resp)
				if resp.id < 0 {
					return extServerResponseToErr(resp)
				}
				return nil
			default:
				var result C.ext_server_task_result_t
				llm.llama_server_completion_next_result(resp.id, &result)
				// Copy everything needed out of result before handing it back.
				json_resp := C.GoString(result.json_resp)
				resultErr := bool(result.error)
				llm.llama_server_release_task_result(&result)

				var p prediction
				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
					llm.llama_server_completion_cancel(resp.id, &resp)
					if resp.id < 0 {
						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
					}
					return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
				}

				if resultErr && strings.Contains(json_resp, "slot unavailable") {
					// task will already be canceled; fall through to the next attempt
					break out
				}

				if p.Content != "" {
					fn(PredictResult{
						Content: p.Content,
					})
				}

				if p.Stop {
					fn(PredictResult{
						Done:               true,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})
					return nil
				}
			}
		}
	}

	// should never reach here ideally
	return fmt.Errorf("max retries exceeded")
}
// Encode tokenizes prompt by delegating to encode with this server.
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
// encode asks llm to tokenize prompt and returns the resulting token ids.
//
// The prompt is wrapped in a TokenizeRequest, serialized to JSON and passed to
// llama_server_tokenize; the JSON reply is decoded as a TokenizeResponse. A
// negative response id from the server is returned as an error. ctx is not
// consulted.
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
	payload, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	cReq := C.CString(string(payload))
	defer C.free(unsafe.Pointer(cReq))

	status := newExtServerResp(128)
	defer freeExtServerResp(status)

	var cJSON *C.char
	llm.llama_server_tokenize(cReq, &cJSON, &status)
	if status.id < 0 {
		return nil, extServerResponseToErr(status)
	}
	// The reply buffer is handed back to the server once it has been copied.
	defer llm.llama_server_release_json_resp(&cJSON)

	var out TokenizeResponse
	if uerr := json.Unmarshal([]byte(C.GoString(cJSON)), &out); uerr != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", uerr)
	}
	return out.Tokens, nil
}
// Decode turns tokens back into text by delegating to decode with this server.
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
// decode asks llm to convert tokens back into text.
//
// An empty token list yields "" without calling the server. Otherwise the
// tokens are wrapped in a DetokenizeRequest, serialized to JSON and passed to
// llama_server_detokenize; the JSON reply is decoded as a DetokenizeResponse.
// A negative response id from the server is returned as an error. ctx is not
// consulted.
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_detokenize(req, &json_resp, &resp)
	if resp.id < 0 {
		return "", extServerResponseToErr(resp)
	}
	// The reply buffer is handed back to the server once it has been copied.
	defer llm.llama_server_release_json_resp(&json_resp)

	var decoded DetokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err)
	}
	return decoded.Content, nil
}
// Embedding computes the embedding of input by delegating to embedding with
// this server.
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
// embedding asks llm for the embedding vector of input.
//
// The input is wrapped in a TokenizeRequest, serialized to JSON and passed to
// llama_server_embedding; the JSON reply is decoded as an EmbeddingResponse.
// A negative response id from the server is returned as an error. ctx is not
// consulted.
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))

	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_embedding(req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	// The reply buffer is handed back to the server once it has been copied.
	defer llm.llama_server_release_json_resp(&json_resp)

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}
	return embedding.Embedding, nil
}
// Close shuts this server down by delegating to the package-level close
// function (not the predeclared builtin, which that function shadows).
func (llm *llamaExtServer) Close() {
close(llm)
}
// close stops llm via llama_server_stop and then unlocks the package-level
// mutex, which newExtServer locked when the server was created.
// NOTE(review): this declaration shadows the predeclared close builtin for the
// whole package.
// NOTE(review): sync.Mutex panics on Unlock of an unlocked mutex, so this
// presumably must be called at most once per successful newExtServer - confirm
// against callers.
func close(llm extServer) {
llm.llama_server_stop()
mutex.Unlock()
}
......@@ -86,74 +86,6 @@ type container interface {
Decode(*readSeekOffset) (model, error)
}
// containerGGML is the container for files identified as "ggml". It carries
// no state.
type containerGGML struct{}

// Name returns the container's identifier, "ggml".
func (c *containerGGML) Name() string {
	return "ggml"
}

// Decode does not parse anything: it only moves ro to the end of the file and
// returns a nil model. A failure to seek is returned to the caller.
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
	// file contents aren't decoded
	if _, err := ro.Seek(0, io.SeekEnd); err != nil {
		return nil, err
	}
	return nil, nil
}
// containerGGMF is the container for files identified as "ggmf". version is
// filled in by Decode.
type containerGGMF struct {
	version uint32
}

// Name returns the container's identifier, "ggmf".
func (c *containerGGMF) Name() string {
	return "ggmf"
}

// Decode reads the little-endian uint32 version from ro, accepts only version
// 1, records it on c and moves ro to the end of the file. It returns a nil
// model. Read and seek failures are returned to the caller instead of being
// reported as an invalid version or ignored.
func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
	var version uint32
	if err := binary.Read(ro, binary.LittleEndian, &version); err != nil {
		return nil, err
	}

	switch version {
	case 1:
	default:
		return nil, errors.New("invalid version")
	}

	c.version = version

	// remaining file contents aren't decoded
	if _, err := ro.Seek(0, io.SeekEnd); err != nil {
		return nil, err
	}
	return nil, nil
}
// containerGGJT is the container for files identified as "ggjt". version is
// filled in by Decode.
type containerGGJT struct {
	version uint32
}

// Name returns the container's identifier, "ggjt".
func (c *containerGGJT) Name() string {
	return "ggjt"
}

// Decode reads the little-endian uint32 version from ro, accepts versions 1-3,
// records it on c, reads the llama hyperparameters that follow and moves ro to
// the end of the file. It returns the populated llamaModel. Read and seek
// failures are returned to the caller, so a truncated file no longer yields a
// model with zeroed hyperparameters.
func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
	var version uint32
	if err := binary.Read(ro, binary.LittleEndian, &version); err != nil {
		return nil, err
	}

	switch version {
	case 1, 2, 3:
	default:
		return nil, errors.New("invalid version")
	}

	c.version = version

	// different model types may have different layouts for hyperparameters
	var llama llamaModel
	if err := binary.Read(ro, binary.LittleEndian, &llama.hyperparameters); err != nil {
		return nil, err
	}

	// remaining file contents aren't decoded
	if _, err := ro.Seek(0, io.SeekEnd); err != nil {
		return nil, err
	}
	return &llama, nil
}
type containerLORA struct {
version uint32
}
......@@ -194,6 +126,8 @@ const (
FILE_MAGIC_GGUF_BE = 0x47475546
)
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
ro := readSeekOffset{ReadSeeker: r}
......@@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
var c container
switch magic {
case FILE_MAGIC_GGML:
c = &containerGGML{}
case FILE_MAGIC_GGMF:
c = &containerGGMF{}
case FILE_MAGIC_GGJT:
c = &containerGGJT{}
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
return nil, ErrUnsupportedFormat
case FILE_MAGIC_GGLA:
c = &containerLORA{}
case FILE_MAGIC_GGUF_LE:
......
# common logic across linux and darwin
# Reset the build variables shared by the generate scripts: LLAMACPP_DIR,
# PATCHES, CMAKE_TARGETS and CMAKE_DEFS. The cmake build type is prepended to
# CMAKE_DEFS: RelWithDebInfo (plus verbose makefiles and gprof) when
# CGO_CFLAGS contains "-g", Release otherwise.
init_vars() {
    LLAMACPP_DIR=gguf
    PATCHES="0001-Expose-callable-API-for-server.patch"
    # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
    CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
    CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_SERVER_VERBOSE=off"
    case "${CGO_CFLAGS}" in
    *-g*)
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
        ;;
    *)
        # TODO - add additional optimization flags...
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release ${CMAKE_DEFS}"
        ;;
    esac
}
# Initialize git submodules and force-update the llama.cpp submodule.
# Does nothing except print a notice when OLLAMA_SKIP_PATCHING is non-empty.
# The submodule directory comes from LLAMACPP_DIR (set by init_vars), falling
# back to "gguf" when it is unset.
git_module_setup() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
    git submodule init
    git submodule update --force "${LLAMACPP_DIR:-gguf}"
}
# Apply every patch named in PATCHES (whitespace separated) to the llama.cpp
# submodule. Does nothing except print a notice when OLLAMA_SKIP_PATCHING is
# non-empty. The submodule directory comes from LLAMACPP_DIR (set by
# init_vars), falling back to "gguf" when it is unset.
apply_patches() {
    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule patching"
        return
    fi
    # Workaround git apply not handling creation well for iteration
    rm -f "${LLAMACPP_DIR:-gguf}/examples/server/server.h"
    # PATCHES is intentionally unquoted: it is a whitespace-separated list.
    for patch in ${PATCHES}; do
        git -C "${LLAMACPP_DIR:-gguf}" apply "../patches/${patch}"
    done
}
# Configure and build llama.cpp with cmake.
# Reads LLAMACPP_DIR (source dir), BUILD_DIR (build dir), CMAKE_DEFS (configure
# flags) and CMAKE_TARGETS (--target flags); all must be set by the caller.
# CMAKE_DEFS and CMAKE_TARGETS are left unquoted on purpose so that each
# space-separated flag becomes its own argument.
build() {
cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
}
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be ../llm/llama.cpp

# TODO - add hardening to detect missing tools (cmake, etc.)

set -ex
set -o pipefail
echo "Starting darwin generate script"
# Quoted so the script still works when its directory contains spaces.
source "$(dirname "$0")/gen_common.sh"
init_vars
CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/metal"
# Select the target architecture from GOARCH, which go generate provides.
case "${GOARCH}" in
"amd64")
    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 ${CMAKE_DEFS}"
    ;;
"arm64")
    CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 ${CMAKE_DEFS}"
    ;;
*)
    echo "GOARCH must be set"
    echo "this script is meant to be run from within go generate"
    exit 1
    ;;
esac

git_module_setup
apply_patches
build
\ No newline at end of file
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/llama.cpp
# First we build our default built-in library which will be linked into the CGO
# binary as a normal dependency. This default build is CPU based.
#
# Then we build a CUDA dynamic library (although statically linked with the CUDA
# library dependencies for maximum portability)
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly
# important to be a dynamic lib even if it's the only GPU library detected because
# we can't redistribute the objectfiles but must rely on dynamic libraries at
# runtime, which could lead the server not to start if not present.
set -ex
set -o pipefail

echo "Starting linux generate script"
# Default CUDACXX to the stock nvcc location when the caller did not set it.
# Two tests joined with && replace the obsolescent "[ ... -a ... ]" form.
if [ -z "${CUDACXX}" ] && [ -x /usr/local/cuda/bin/nvcc ]; then
    export CUDACXX=/usr/local/cuda/bin/nvcc
fi
COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
OLLAMA_DYN_LIB_DIR="gguf/build/lib"
# Quoted so the script still works when its directory contains spaces.
source "$(dirname "$0")/gen_common.sh"
init_vars
git_module_setup
apply_patches

# Create the dynamic library directory and a marker file inside it, so the
# directory is non-empty even when no GPU library gets built.
mkdir -p "${OLLAMA_DYN_LIB_DIR}"
touch "${OLLAMA_DYN_LIB_DIR}/.generated"

#
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/cpu"
build
# CUDA build: only attempted when the default CUDA library directory exists.
if [ -d /usr/local/cuda/lib64/ ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
# init_vars resets CMAKE_DEFS so the CPU settings above do not carry over.
init_vars
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="gguf/build/cuda"
CUDA_LIB_DIR=/usr/local/cuda/lib64
build
# Wrap the static archives from the build into one shared object.
# --whole-archive pulls in every object from the llama.cpp archives; the CUDA
# runtime pieces are linked from their static (.a) variants.
gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/libcuda_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/examples/server/libext_server.a \
${BUILD_DIR}/common/libcommon.a \
${BUILD_DIR}/libllama.a \
-Wl,--no-whole-archive \
${CUDA_LIB_DIR}/libcudart_static.a \
${CUDA_LIB_DIR}/libcublas_static.a \
${CUDA_LIB_DIR}/libcublasLt_static.a \
${CUDA_LIB_DIR}/libcudadevrt.a \
${CUDA_LIB_DIR}/libculibos.a \
-lrt -lpthread -ldl -lstdc++ -lm
fi
# Fall back to the default ROCm location when ROCM_PATH is unset or empty.
: "${ROCM_PATH:=/opt/rocm}"

# Likewise export the default CLBlast cmake directory, but only when the caller
# has not chosen one and the default actually exists.
if [ -z "${CLBlast_DIR}" ] && [ -d /usr/lib/cmake/CLBlast ]; then
    export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
# ROCm build: only attempted when the ROCm directory exists.
if [ -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    # init_vars resets CMAKE_DEFS so earlier build settings do not carry over.
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
    BUILD_DIR="gguf/build/rocm"
    build
    # Wrap the static archives into one shared object linked against the ROCm
    # shared libraries. The link-time search path follows ROCM_PATH so that a
    # non-default install is honoured, matching the compiler paths above. The
    # rpath entries are left at the standard locations searched at runtime.
    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/librocm_server.so \
        -Wl,--whole-archive \
        ${BUILD_DIR}/examples/server/libext_server.a \
        ${BUILD_DIR}/common/libcommon.a \
        ${BUILD_DIR}/libllama.a \
        -Wl,--no-whole-archive \
        -lrt -lpthread -ldl -lstdc++ -lm \
        -L"${ROCM_PATH}/lib" -L/opt/amdgpu/lib/x86_64-linux-gnu/ \
        -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \
        -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu
fi
#!powershell
$ErrorActionPreference = "Stop"
# Reset the script-scoped build variables: the patch list, the cmake configure
# flags and the build configuration name.
# The configuration is RelWithDebInfo (with verbose makefiles) when CGO_CFLAGS
# contains "-g", and Release otherwise.
function init_vars {
    $script:patches = @("0001-Expose-callable-API-for-server.patch")
    $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-DLLAMA_K_QUANTS=on", "-DLLAMA_ACCELERATE=on", "-A","x64")

    # -like does a wildcard substring match. -contains is a collection
    # membership test and, applied to a single string, is only true when the
    # whole string equals "-g".
    if ($env:CGO_CFLAGS -like "*-g*") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on")
        $script:config = "RelWithDebInfo"
    } else {
        $script:config = "Release"
    }
}
# Initialize git submodules and force-update the gguf submodule.
# Exits the script with git's exit code if either command fails.
function git_module_setup {
# TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
& git submodule init
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
& git submodule update --force gguf
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
# Apply every patch in $script:patches to the gguf submodule.
# Exits the script with git's exit code on the first patch that fails.
function apply_patches {
# Remove server.h first; errors (e.g. the file not existing) are ignored.
rm -erroraction ignore -path "gguf/examples/server/server.h"
foreach ($patch in $script:patches) {
write-host "Applying patch $patch"
& git -C gguf apply ../patches/$patch
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
}
# Configure and build gguf with cmake.
# Reads $script:buildDir, $script:cmakeDefs and $script:config, which the
# caller must set beforehand. Exits the script with cmake's exit code if the
# configure or build step fails.
function build {
write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
& cmake --version
& cmake -S gguf -B $script:buildDir $script:cmakeDefs
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
write-host "building with: cmake --build $script:buildDir --config $script:config"
& cmake --build $script:buildDir --config $script:config
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
# Run cmake --install from $script:buildDir into $script:installDir, after
# deleting any existing install directory (deletion errors are ignored).
# Exits the script with cmake's exit code if the install fails.
function install {
rm -erroraction ignore -recurse -force -path $script:installDir
& cmake --install $script:buildDir --prefix $script:installDir --config $script:config
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
# Main flow: prepare the submodule, build the CPU library, then the CUDA one.
init_vars
git_module_setup
apply_patches

# first build CPU based
$script:buildDir="gguf/build/wincpu"
$script:installDir="gguf/build/wincpu/dist"

build
# install

# "-ea 0" silences errors from md, e.g. when the directory already exists.
md gguf/build/lib -ea 0
md gguf/build/wincpu/dist/lib -ea 0
# The CPU build's DLL is moved and renamed to cpu_server.dll.
mv gguf/build/wincpu/bin/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.dll


# Nope, this barfs on lots of symbol problems
#mv gguf/build/wincpu/examples/server/$script:config/ext_server_shared.dll gguf/build/wincpu/dist/lib/cpu_server.lib
# Nope: this needs lots of include paths to pull in things like msvcprt.lib and other deps
# & cl.exe `
# gguf/build/wincpu/examples/server/$script:config/ext_server.lib `
# gguf/build/wincpu/common/$script:config/common.lib `
# gguf/build/wincpu/$script:config/llama.lib `
# gguf/build/wincpu/$script:config/ggml_static.lib `
# /link /DLL /DEF:cpu_server.def /NOENTRY /MACHINE:X64 /OUT:gguf/build/wincpu/dist/lib/cpu_server.dll
# if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}

# Then build cuda as a dynamically loaded library
# init_vars resets cmakeDefs before the CUDA flags are appended.
init_vars
$script:buildDir="gguf/build/wincuda"
$script:installDir="gguf/build/wincuda/dist"
# NOTE(review): -DBUILD_SHARED_LIBS=on is already in the init_vars defaults, so
# it appears twice on the cmake command line - confirm whether that is intended.
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DBUILD_SHARED_LIBS=on")
build
install
cp gguf/build/wincuda/dist/bin/ext_server_shared.dll gguf/build/lib/cuda_server.dll

# TODO - more to do here to create a usable dll


# TODO - implement ROCm support on windows
# Create the winrocm lib directory with a .generated marker file in it.
md gguf/build/winrocm/lib -ea 0
echo $null >> gguf/build/winrocm/lib/.generated
package llm
//go:generate sh ./gen_darwin.sh
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build ggml/build/metal --target server --config Release
//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
//go:generate cmake --build gguf/build/metal --target server --config Release
//go:generate mv gguf/build/metal/bin/server gguf/build/metal/bin/ollama-runner
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
//go:generate bash ./gen_linux.sh
package llm
//go:generate git submodule init
//go:generate git submodule update --force ggml
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cpu --target server --config Release
//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
//go:generate git submodule update --force gguf
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cpu --target server --config Release
//go:generate cmd /c move gguf\build\cpu\bin\Release\server.exe gguf\build\cpu\bin\Release\ollama-runner.exe
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
//go:generate cmake --build ggml/build/cuda --target server --config Release
//go:generate cmd /c move ggml\build\cuda\bin\Release\server.exe ggml\build\cuda\bin\Release\ollama-runner.exe
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
//go:generate cmake --build gguf/build/cuda --target server --config Release
//go:generate cmd /c move gguf\build\cuda\bin\Release\server.exe gguf\build\cuda\bin\Release\ollama-runner.exe
//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b
Subproject commit a7aee47b98e45539d491071b25778b833b77e387
Subproject commit 328b83de23b33240e28f4e74900d1d06726f5eb1
From 4c72576c5f6c2217b1ecf7fd8523616acc5526ae Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Mon, 13 Nov 2023 12:25:58 -0800
Subject: [PATCH] Expose callable API for server
This adds an extern "C" interface within the example server
---
examples/server/CMakeLists.txt | 24 +++
examples/server/server.cpp | 279 +++++++++++++++++++++++++++++++++
examples/server/server.h | 89 +++++++++++
ggml-cuda.cu | 1 +
4 files changed, 393 insertions(+)
create mode 100644 examples/server/server.h
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 859cd12..4ea47a7 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -11,3 +11,27 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET ext_server)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+add_library(${TARGET} STATIC server.cpp)
+target_include_directories(${TARGET} PRIVATE ../../common)
+target_include_directories(${TARGET} PRIVATE ../..)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
+target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD)
+ add_library(ext_server_shared SHARED $<TARGET_OBJECTS:ext_server>)
+ target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT})
+ install(TARGETS ext_server_shared LIBRARY)
+endif()
+
+# Add the CUDA toolkit headers (and NVML on Windows) when the toolkit was found.
+if (CUDAToolkit_FOUND)
+    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    # ext_server_shared is only created when BUILD_SHARED_LIBS is ON, so check
+    # that the target exists before linking against it; otherwise a static
+    # Windows + CUDA configure fails on an unknown target.
+    if (WIN32 AND TARGET ext_server_shared)
+        target_link_libraries(ext_server_shared PRIVATE nvml)
+    endif()
+endif()
\ No newline at end of file
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0403853..5e78e4d 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -5,6 +5,9 @@
#include "../llava/clip.h"
#include "stb_image.h"
+#if defined(LLAMA_SERVER_LIBRARY)
+#include "server.h"
+#endif
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -2643,6 +2646,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
}
+#ifndef LLAMA_SERVER_LIBRARY
int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
@@ -3123,3 +3127,278 @@ int main(int argc, char **argv)
llama_backend_free();
return 0;
}
+
+#else // LLAMA_SERVER_LIBRARY
+// Expose the llama server as a callable extern "C" API
+llama_server_context *llama = NULL;
+std::atomic<bool> ext_server_running(false);
+std::thread ext_server_thread;
+
+// Build the global llama_server_context from the caller-supplied parameters,
+// load the model and initialize the server.
+//
+// sparams: settings copied into a gpt_params; must not be NULL.
+// err:     out-parameter; err->id is 0 and err->msg is empty on success,
+//          err->id is -1 and err->msg holds a description on failure.
+//
+// On any failure the partially constructed context is released and the global
+// pointer reset, so a later init call does not leak the previous attempt.
+void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err)
+{
+#if SERVER_VERBOSE != 1
+    log_disable();
+#endif
+    assert(err != NULL && sparams != NULL);
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        llama = new llama_server_context;
+        log_set_target(stdout);
+        gpt_params params;
+        params.n_ctx = sparams->n_ctx;
+        params.n_batch = sparams->n_batch;
+        // 0 means "not specified": keep the gpt_params default thread count
+        if (sparams->n_threads > 0) {
+            params.n_threads = sparams->n_threads;
+        }
+        params.n_parallel = sparams->n_parallel;
+        params.rope_freq_base = sparams->rope_freq_base;
+        params.rope_freq_scale = sparams->rope_freq_scale;
+
+        if (sparams->memory_f16) {
+            params.cache_type_k = "f16";
+            params.cache_type_v = "f16";
+        } else {
+            params.cache_type_k = "f32";
+            params.cache_type_v = "f32";
+        }
+
+        params.n_gpu_layers = sparams->n_gpu_layers;
+        params.main_gpu = sparams->main_gpu;
+        params.use_mlock = sparams->use_mlock;
+        params.use_mmap = sparams->use_mmap;
+        params.numa = sparams->numa;
+        params.embedding = sparams->embedding;
+        if (sparams->model != NULL) {
+            params.model = sparams->model;
+        }
+
+        // lora_adapters is a NULL-terminated singly linked list owned by the caller
+        for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) {
+            params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
+        }
+
+        if (sparams->mmproj != NULL) {
+            params.mmproj = std::string(sparams->mmproj);
+        }
+
+        llama_backend_init(params.numa);
+
+        // load the model
+        if (!llama->load_model(params))
+        {
+            // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages
+            // and pass them back to the caller for better UX
+            err->id = -1;
+            snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+            // do not leave a half-initialized context behind
+            delete llama;
+            llama = NULL;
+            return;
+        }
+
+        llama->initialize();
+    } catch (std::exception &e) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", e.what());
+        delete llama;
+        llama = NULL;
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server");
+        delete llama;
+        llama = NULL;
+    }
+}
+
+// Spawn the background thread that drives the server main loop. The loop calls
+// update_slots() until llama_server_stop() clears ext_server_running or
+// update_slots() reports an error. Requires a prior successful init.
+void llama_server_start()
+{
+    assert(llama != NULL);
+    // Mark the server as running BEFORE the thread exists. If the flag were set
+    // inside the thread, a stop request issued right after start could be
+    // overwritten by the late "true" store and the join in stop would hang.
+    ext_server_running = true;
+    // TODO mutex to protect thread creation
+    ext_server_thread = std::thread([&]()
+    {
+        try {
+            LOG_TEE("llama server main loop starting\n");
+            ggml_time_init();
+            while (ext_server_running.load())
+            {
+                if (!llama->update_slots()) {
+                    LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n");
+                    break;
+                }
+            }
+        } catch (std::exception &e) {
+            LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
+        } catch (...) {
+            LOG_TEE("caught unknown exception in llama server main loop\n");
+        }
+        LOG_TEE("\nllama server shutting down\n");
+        llama_backend_free();
+    });
+}
+
+// Ask the main loop to exit, wait for its thread to finish, then destroy the
+// global server context. Init must be called again before the server is reused.
+void llama_server_stop() {
+    assert(llama != NULL);
+    // TODO - too verbose, remove once things are solid
+    LOG_TEE("requesting llama server shutdown\n");
+    ext_server_running = false;
+    // join() on a thread that was never started (init without start) throws
+    // std::system_error, which must not escape an extern "C" entry point.
+    if (ext_server_thread.joinable()) {
+        ext_server_thread.join();
+    }
+    delete llama;
+    llama = NULL;
+    LOG_TEE("llama server shutdown complete\n");
+}
+
+// Queue a completion request described by the JSON text in json_req.
+// resp->id receives the task id returned by request_completion; it stays -1
+// and resp->msg carries the reason when parsing or queuing throws.
+void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
+    assert(llama != NULL && json_req != NULL && resp != NULL);
+    // start from the failure state; only a successful enqueue overwrites the id
+    resp->msg[0] = '\0';
+    resp->id = -1;
+    try {
+        json request = json::parse(json_req);
+        const int task_id = llama->request_completion(request, false, false, -1);
+        resp->id = task_id;
+    } catch (std::exception &ex) {
+        const char *reason = ex.what();
+        snprintf(resp->msg, resp->msg_len, "exception %s", reason);
+    } catch (...) {
+        snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
+    }
+}
+
+// Fetch the next result for task_id and copy it into resp.
+//
+// resp->id / stop / error mirror the task_result fields; resp->json_resp is a
+// newly allocated, NUL-terminated copy of the result JSON that the caller must
+// hand back to llama_server_release_task_result. When the task reports an
+// error or its final result, the task is cancelled. If an exception is thrown,
+// resp->error is set, resp->id is -1 and json_resp holds an {"error": ...}
+// object.
+void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) {
+    assert(llama != NULL && resp != NULL);
+    resp->id = -1;
+    resp->stop = false;
+    resp->error = false;
+    resp->json_resp = NULL;
+    std::string result_json;
+    try {
+        task_result result = llama->next_result(task_id);
+        result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        resp->id = result.id;
+        resp->stop = result.stop;
+        resp->error = result.error;
+        // a failed or finished task produces no further results
+        if (result.error || result.stop) {
+            llama->request_cancel(task_id);
+        }
+    } catch (std::exception &e) {
+        resp->error = true;
+        resp->id = -1;
+        // Serialize through the json type so quotes, backslashes or control
+        // characters in the exception text cannot produce malformed JSON.
+        result_json = json{{"error", "exception " + std::string(e.what())}}
+                          .dump(-1, ' ', false, json::error_handler_t::replace);
+    } catch (...) {
+        resp->error = true;
+        resp->id = -1;
+        result_json = "{\"error\":\"Unknown exception during completion\"}";
+    }
+    // +1 for the terminating NUL written by snprintf
+    const std::string::size_type size = result_json.size() + 1;
+    resp->json_resp = new char[size];
+    snprintf(resp->json_resp, size, "%s", result_json.c_str());
+}
+
+// Free the JSON buffer allocated by llama_server_completion_next_result.
+// Accepts NULL and already-released results; the pointer is cleared after the
+// delete so that releasing the same result twice is harmless.
+void llama_server_release_task_result(ext_server_task_result_t *result) {
+    if (result == NULL || result->json_resp == NULL) {
+        return;
+    }
+    delete[] result->json_resp;
+    result->json_resp = NULL;
+}
+
+// Request cancellation of the task identified by task_id.
+// err->id is 0 with an empty message unless request_cancel throws, in which
+// case err->id is -1 and err->msg describes the exception.
+void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
+    assert(llama != NULL && err != NULL);
+    // assume success until an exception says otherwise
+    err->msg[0] = '\0';
+    err->id = 0;
+    try {
+        llama->request_cancel(task_id);
+    } catch (std::exception &ex) {
+        const char *reason = ex.what();
+        snprintf(err->msg, err->msg_len, "exception %s", reason);
+        err->id = -1;
+    } catch (...) {
+        snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server");
+        err->id = -1;
+    }
+}
+
+// Tokenize the "content" member of the JSON request.
+// On success *json_resp is a newly allocated, NUL-terminated JSON string (to
+// be freed with llama_server_release_json_resp) and err->id is 0. If anything
+// throws, err->id is -1 and err->msg holds the reason. A request without
+// "content" yields an empty token list.
+void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json request = json::parse(json_req);
+        std::vector<llama_token> token_ids;
+        if (request.count("content") > 0) {
+            token_ids = llama->tokenize(request["content"], false);
+        }
+        const json reply = format_tokenizer_response(token_ids);
+        const std::string serialized = reply.dump();
+        // one extra byte for the terminating NUL
+        const std::string::size_type buf_len = serialized.size() + 1;
+        char *buffer = new char[buf_len];
+        *json_resp = buffer;
+        snprintf(buffer, buf_len, "%s", serialized.c_str());
+    } catch (std::exception &ex) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
+    }
+}
+
+// Free a JSON buffer returned through the json_resp out-parameter of
+// tokenize / detokenize / embedding. Accepts NULL and already-released
+// buffers; the caller's pointer is cleared after the delete so that a second
+// release is a no-op instead of a double free.
+void llama_server_release_json_resp(char **json_resp) {
+    if (json_resp == NULL || *json_resp == NULL) {
+        return;
+    }
+    delete[] *json_resp;
+    *json_resp = NULL;
+}
+
+// Convert the "tokens" array of the JSON request back into text.
+// On success *json_resp is a newly allocated, NUL-terminated JSON string (to
+// be freed with llama_server_release_json_resp) and err->id is 0. If anything
+// throws, err->id is -1 and err->msg holds the reason. A request without
+// "tokens" yields empty content.
+void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json request = json::parse(json_req);
+        std::string text;
+        if (request.count("tokens") > 0) {
+            const std::vector<llama_token> token_ids = request["tokens"];
+            text = tokens_to_str(llama->ctx, token_ids.cbegin(), token_ids.cend());
+        }
+        const json reply = format_detokenized_response(text);
+        const std::string serialized = reply.dump();
+        // one extra byte for the terminating NUL
+        const std::string::size_type buf_len = serialized.size() + 1;
+        char *buffer = new char[buf_len];
+        *json_resp = buffer;
+        snprintf(buffer, buf_len, "%s", serialized.c_str());
+    } catch (std::exception &ex) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
+    }
+}
+
+// Run an embedding request for the "content" member of the JSON request (an
+// empty prompt when the member is absent) and wait for its result.
+// On success *json_resp is a newly allocated, NUL-terminated JSON string (to
+// be freed with llama_server_release_json_resp) and err->id is 0. If anything
+// throws, err->id is -1 and err->msg holds the reason.
+void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) {
+    assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
+    *json_resp = NULL;
+    err->id = 0;
+    err->msg[0] = '\0';
+    try {
+        const json request = json::parse(json_req);
+        json prompt = "";
+        if (request.count("content") != 0) {
+            prompt = request["content"];
+        }
+        // n_predict 0: no tokens are generated for this request
+        const int embed_task = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+        task_result outcome = llama->next_result(embed_task);
+        const std::string serialized = outcome.result_json.dump();
+        // one extra byte for the terminating NUL
+        const std::string::size_type buf_len = serialized.size() + 1;
+        char *buffer = new char[buf_len];
+        *json_resp = buffer;
+        snprintf(buffer, buf_len, "%s", serialized.c_str());
+    } catch (std::exception &ex) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "exception %s", ex.what());
+    } catch (...) {
+        err->id = -1;
+        snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
+    }
+}
+
+#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file
diff --git a/examples/server/server.h b/examples/server/server.h
new file mode 100644
index 0000000..d22f1b6
--- /dev/null
+++ b/examples/server/server.h
@@ -0,0 +1,89 @@
+#if defined(LLAMA_SERVER_LIBRARY)
+#ifndef LLAMA_SERVER_H
+#define LLAMA_SERVER_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+// This exposes extern C entrypoints into the llama_server
+// To enable the server compile with LLAMA_SERVER_LIBRARY
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+ // Status record filled in by the ext_server entry points.
+ typedef struct ext_server_resp {
+ int id; // < 0 on error
+ size_t msg_len; // caller must allocate msg and set msg_len
+ char *msg; // error text, written with snprintf bounded by msg_len
+ } ext_server_resp_t;
+
+ // Allocated and freed by caller
+ // One node of a singly linked list of LoRA adapters.
+ typedef struct ext_server_lora_adapter {
+ char *adapter;
+ float scale;
+ struct ext_server_lora_adapter *next; // NULL terminates the list
+ } ext_server_lora_adapter_t;
+
+ // Allocated and freed by caller
+ // Settings consumed by llama_server_init.
+ typedef struct ext_server_params
+ {
+ char *model; // model to load; NULL keeps the default
+ uint32_t n_ctx; // text context, 0 = from model
+ uint32_t n_batch; // prompt processing maximum batch size
+ uint32_t n_threads; // number of threads to use for generation, 0 = keep default
+ int32_t n_parallel; // number of parallel sequences to decode
+ float rope_freq_base; // RoPE base frequency, 0 = from model
+ float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+ bool memory_f16; // use f16 instead of f32 for memory kv
+ int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default)
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
+ bool use_mlock; // force system to keep model in RAM
+ bool use_mmap; // use mmap if possible
+ bool numa; // attempt optimizations that help on some NUMA systems
+ bool embedding; // get only sentence embedding
+ ext_server_lora_adapter_t* lora_adapters; // linked list of adapters, NULL if none
+ char *mmproj; // ignored when NULL
+ } ext_server_params_t;
+
+ // Result of one llama_server_completion_next_result call.
+ typedef struct ext_server_task_result
+ {
+ int id; // -1 when fetching the result threw an exception
+ bool stop;
+ bool error;
+ char* json_resp; // null terminated, memory managed by ext_server (free with llama_server_release_task_result)
+ } ext_server_task_result_t;
+
+ // Initialize the server once per process
+ // err->id = 0 for success and err->msg[0] = '\0'
+ // err->id != 0 for failure, and err->msg contains error message
+ void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
+
+ // Run the main loop, called once per init
+ void llama_server_start();
+ // Stop the main loop and free up resources allocated in init and start. Init must be called again to reuse
+ void llama_server_stop();
+
+ // json_req null terminated string, memory managed by caller
+ // resp->id >= 0 on success (task ID)
+ // resp->id < 0 on error, and resp->msg contains error message
+ void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
+
+ // Caller must call llama_server_release_task_result to free resp->json_resp
+ void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result);
+ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
+ void llama_server_release_task_result(ext_server_task_result_t *result);
+
+ // Caller must call llama_server_release_json_resp to free json_resp on success (err->id == 0); on failure *json_resp is NULL
+ void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err);
+ void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err);
+ void llama_server_release_json_resp(char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+#endif // LLAMA_SERVER_LIBRARY
\ No newline at end of file
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f20846f..9640cf3 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6757,6 +6757,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
CUDA_CHECK(cudaGetDevice(&id));
src_ptr = (char *) extra->data_device[id];
} else {
+ fprintf(stderr, "ggml_cuda_cpy_tensor_2d assert: backend: %d\n", src->backend);
GGML_ASSERT(false);
}
char * dst_ptr = (char *) dst;
--
2.39.3 (Apple Git-145)
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:12 -0400
Subject: [PATCH] add detokenize endpoint
---
examples/server/server.cpp | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9966045..5014691 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}};
}
+// Wrap detokenized text in the {"content": ...} object returned to clients.
+// Takes the string by const reference to avoid copying it on every call.
+static json format_detokenized_response(const std::string &content)
+{
+    return json{
+        {"content", content}};
+}
+
static void parse_options_completion(const json &body, llama_server_context &llama)
{
gpt_params default_params;
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); });
+ svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+ {
+ auto lock = llama.lock();
+
+ const json body = json::parse(req.body);
+ std::string content;
+ if (body.count("tokens") != 0)
+ {
+ const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+ }
+
+ const json data = format_detokenized_response(content);
+ return res.set_content(data.dump(), "application/json"); });
+
svr.Post("/embedding", [&llama](const Request &req, Response &res)
{
auto lock = llama.lock();
--
2.39.2 (Apple Git-143)
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0
From 6465fec6290f0a7f5d4d0fbe6bcf634e4810dde6 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 23 Oct 2023 10:39:34 -0700
Subject: [PATCH] default log stderr
---
common/log.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/log.h b/common/log.h
index b8953fd..25522cd 100644
--- a/common/log.h
+++ b/common/log.h
@@ -90,7 +90,7 @@
// }
//
#ifndef LOG_TARGET
- #define LOG_TARGET log_handler()
+ #define LOG_TARGET nullptr
#endif
#ifndef LOG_TEE_TARGET
--
2.42.0
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:53 -0400
Subject: [PATCH] 34B model support
---
llama.cpp | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/llama.cpp b/llama.cpp
index f2cbe76..62c5cdf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,7 @@ enum e_model {
MODEL_7B,
MODEL_13B,
MODEL_30B,
+ MODEL_34B,
MODEL_65B,
MODEL_70B,
};
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
};
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{ MODEL_7B, 160ull * MB },
{ MODEL_13B, 192ull * MB },
{ MODEL_30B, 256ull * MB },
+ { MODEL_34B, 256ull * MB },
{ MODEL_65B, 384ull * MB }, // guess
{ MODEL_70B, 304ull * MB },
};
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{ MODEL_7B, 10ull * MB },
{ MODEL_13B, 12ull * MB },
{ MODEL_30B, 16ull * MB },
+ { MODEL_34B, 16ull * MB },
{ MODEL_65B, 24ull * MB }, // guess
{ MODEL_70B, 24ull * MB },
};
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
+ { MODEL_34B, 768ull * kB },
{ MODEL_65B, 1280ull * kB },
{ MODEL_70B, 1280ull * kB },
};
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
+ { MODEL_34B, 208ull },
{ MODEL_65B, 256ull },
{ MODEL_70B, 256ull },
};
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
default: LLAMA_ASSERT(false);
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default:
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
model.type = e_model::MODEL_70B;
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
}
hparams.rope_freq_base = rope_freq_base;
--
2.39.2 (Apple Git-143)
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
kernel (#2686)
---
ggml-metal.metal | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
+ threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment