"git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "3bf3e8ef24c78fda94b577eabdfe76ce9dc02022"
Commit fd4792ec authored by Michael Yang's avatar Michael Yang
Browse files

call llama.cpp directly from go

parent a3ec1ec2
...@@ -8,3 +8,16 @@ dist ...@@ -8,3 +8,16 @@ dist
__pycache__ __pycache__
ollama ollama
ggml-metal.metal ggml-metal.metal
# cmake gitignore
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
# Top-level build for ollama: fetch and build llama.cpp, then build the
# Go binary with cgo pointed at the fetched headers and libraries.
cmake_minimum_required(VERSION 3.12)
project(ollama)
include(FetchContent)
# Pin llama.cpp to a known-good commit. FetchContent defines
# llama.cpp_SOURCE_DIR and llama.cpp_BINARY_DIR, used below.
FetchContent_Declare(
	"llama.cpp"
	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
	GIT_TAG 55dbb91
)
FetchContent_MakeAvailable(llama.cpp)
# Build the Go binary on every build (ALL); also depends on the
# Metal shader copied into place by the custom command below.
add_custom_target(
	ollama
	ALL
	DEPENDS
	${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal
	COMMAND
	${CMAKE_COMMAND} -E
	env
	CGO_CPPFLAGS='-I${llama.cpp_SOURCE_DIR}'
	CGO_LDFLAGS='-L${llama.cpp_BINARY_DIR} -lllama -lggml_static -lm -lstdc++'
	CGO_CXXFLAGS='-std=c++11'
	--
	go build .
	WORKING_DIRECTORY
	${CMAKE_CURRENT_SOURCE_DIR}
)
# Copy llama.cpp's Metal shader into the source tree so the binary can
# find it at runtime (only changes when the upstream file changes).
add_custom_command(
	OUTPUT
	${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal
	COMMAND
	${CMAKE_COMMAND} -E
	copy_if_different
	${llama.cpp_SOURCE_DIR}/ggml-metal.metal
	${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal
	WORKING_DIRECTORY
	${CMAKE_CURRENT_SOURCE_DIR}
)
# Ensure the llama.cpp libraries are built before linking the Go binary.
add_dependencies(ollama llama ggml_static)
# Build entry points for ollama: the llama.cpp bindings, the Go binary,
# and the desktop app.

# Default target: build the ollama binary.
default: ollama

# Configure and build the llama.cpp bindings via CMake (Metal enabled).
.PHONY: llama
llama:
	cmake -S llama -B llama/build -DLLAMA_METAL=on
	cmake --build llama/build

# Build the Go binary; requires the llama libraries to exist first.
.PHONY: ollama
ollama: llama
	go build .

# Package and sign the desktop app.
.PHONY: app
app: ollama
	npm install --prefix app
	npm run --prefix app make:sign

# Remove build artifacts. Declared .PHONY (previously missing) so a
# stray file named "clean" can never mask this target.
.PHONY: clean
clean:
	go clean
	rm -rf llama/build
package api package api
import "runtime"
type PullRequest struct { type PullRequest struct {
Model string `json:"model"` Model string `json:"model"`
} }
...@@ -14,93 +16,76 @@ type GenerateRequest struct { ...@@ -14,93 +16,76 @@ type GenerateRequest struct {
Model string `json:"model"` Model string `json:"model"`
Prompt string `json:"prompt"` Prompt string `json:"prompt"`
ModelOptions *ModelOptions `json:"model_opts,omitempty"` Options `json:"options"`
PredictOptions *PredictOptions `json:"predict_opts,omitempty"`
} }
type ModelOptions struct { type GenerateResponse struct {
ContextSize int `json:"context_size,omitempty"` Response string `json:"response"`
Seed int `json:"seed,omitempty"`
NBatch int `json:"n_batch,omitempty"`
F16Memory bool `json:"memory_f16,omitempty"`
MLock bool `json:"mlock,omitempty"`
MMap bool `json:"mmap,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
LowVRAM bool `json:"low_vram,omitempty"`
Embeddings bool `json:"embeddings,omitempty"`
NUMA bool `json:"numa,omitempty"`
NGPULayers int `json:"gpu_layers,omitempty"`
MainGPU string `json:"main_gpu,omitempty"`
TensorSplit string `json:"tensor_split,omitempty"`
} }
type PredictOptions struct { type Options struct {
Seed int `json:"seed,omitempty"` Seed int `json:"seed,omitempty"`
Threads int `json:"threads,omitempty"`
Tokens int `json:"tokens,omitempty"`
TopK int `json:"top_k,omitempty"`
Repeat int `json:"repeat,omitempty"`
Batch int `json:"batch,omitempty"`
NKeep int `json:"nkeep,omitempty"`
TopP float64 `json:"top_p,omitempty"`
Temperature float64 `json:"temp,omitempty"`
Penalty float64 `json:"penalty,omitempty"`
F16KV bool
DebugMode bool
StopPrompts []string
IgnoreEOS bool `json:"ignore_eos,omitempty"`
TailFreeSamplingZ float64 `json:"tfs_z,omitempty"`
TypicalP float64 `json:"typical_p,omitempty"`
FrequencyPenalty float64 `json:"freq_penalty,omitempty"`
PresencePenalty float64 `json:"pres_penalty,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatETA float64 `json:"mirostat_lr,omitempty"`
MirostatTAU float64 `json:"mirostat_ent,omitempty"`
PenalizeNL bool `json:"penalize_nl,omitempty"`
LogitBias string `json:"logit_bias,omitempty"`
PathPromptCache string
MLock bool `json:"mlock,omitempty"`
MMap bool `json:"mmap,omitempty"`
PromptCacheAll bool
PromptCacheRO bool
MainGPU string
TensorSplit string
}
var DefaultModelOptions ModelOptions = ModelOptions{ // Backend options
ContextSize: 512, UseNUMA bool `json:"numa,omitempty"`
Seed: 0,
F16Memory: true, // Model options
MLock: false, NumCtx int `json:"num_ctx,omitempty"`
Embeddings: true, NumBatch int `json:"num_batch,omitempty"`
MMap: true, NumGPU int `json:"num_gpu,omitempty"`
LowVRAM: false, MainGPU int `json:"main_gpu,omitempty"`
} LowVRAM bool `json:"low_vram,omitempty"`
F16KV bool `json:"f16_kv,omitempty"`
LogitsAll bool `json:"logits_all,omitempty"`
VocabOnly bool `json:"vocab_only,omitempty"`
UseMMap bool `json:"use_mmap,omitempty"`
UseMLock bool `json:"use_mlock,omitempty"`
EmbeddingOnly bool `json:"embedding_only,omitempty"`
// Predict options
RepeatLastN int `json:"repeat_last_n,omitempty"`
RepeatPenalty float32 `json:"repeat_penalty,omitempty"`
FrequencyPenalty float32 `json:"frequency_penalty,omitempty"`
PresencePenalty float32 `json:"presence_penalty,omitempty"`
Temperature float32 `json:"temperature,omitempty"`
TopK int `json:"top_k,omitempty"`
TopP float32 `json:"top_p,omitempty"`
TFSZ float32 `json:"tfs_z,omitempty"`
TypicalP float32 `json:"typical_p,omitempty"`
Mirostat int `json:"mirostat,omitempty"`
MirostatTau float32 `json:"mirostat_tau,omitempty"`
MirostatEta float32 `json:"mirostat_eta,omitempty"`
var DefaultPredictOptions PredictOptions = PredictOptions{ NumThread int `json:"num_thread,omitempty"`
Seed: -1,
Threads: -1,
Tokens: 512,
Penalty: 1.1,
Repeat: 64,
Batch: 512,
NKeep: 64,
TopK: 90,
TopP: 0.86,
TailFreeSamplingZ: 1.0,
TypicalP: 1.0,
Temperature: 0.8,
FrequencyPenalty: 0.0,
PresencePenalty: 0.0,
Mirostat: 0,
MirostatTAU: 5.0,
MirostatETA: 0.1,
MMap: true,
StopPrompts: []string{"llama"},
} }
type GenerateResponse struct { func DefaultOptions() Options {
Response string `json:"response"` return Options{
Seed: -1,
UseNUMA: false,
NumCtx: 512,
NumBatch: 512,
NumGPU: 1,
LowVRAM: false,
F16KV: true,
UseMMap: true,
UseMLock: false,
RepeatLastN: 512,
RepeatPenalty: 1.1,
FrequencyPenalty: 0.0,
PresencePenalty: 0.0,
Temperature: 0.8,
TopK: 40,
TopP: 0.9,
TFSZ: 1.0,
TypicalP: 1.0,
Mirostat: 0,
MirostatTau: 5.0,
MirostatEta: 0.1,
NumThread: runtime.NumCPU(),
}
} }
...@@ -39,6 +39,7 @@ require ( ...@@ -39,6 +39,7 @@ require (
golang.org/x/arch v0.3.0 // indirect golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.10.0 // indirect golang.org/x/crypto v0.10.0 // indirect
golang.org/x/net v0.10.0 // indirect golang.org/x/net v0.10.0 // indirect
golang.org/x/sync v0.3.0
golang.org/x/sys v0.10.0 // indirect golang.org/x/sys v0.10.0 // indirect
golang.org/x/term v0.10.0 golang.org/x/term v0.10.0
golang.org/x/text v0.10.0 // indirect golang.org/x/text v0.10.0 // indirect
......
...@@ -99,6 +99,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= ...@@ -99,6 +99,8 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
......
# Build the C++ binding library that bridges Go and llama.cpp.
cmake_minimum_required(VERSION 3.12)
project(binding)
include(FetchContent)
# Fetch llama.cpp pinned to the same commit as the top-level build.
FetchContent_Declare(
	llama_cpp
	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
	GIT_TAG 55dbb91
)
FetchContent_MakeAvailable(llama_cpp)
# The binding combines the local shim with llama.cpp's example helpers.
add_library(binding ${CMAKE_CURRENT_SOURCE_DIR}/binding/binding.cpp ${llama_cpp_SOURCE_DIR}/examples/common.cpp)
target_include_directories(binding PRIVATE ${llama_cpp_SOURCE_DIR}/examples)
target_link_libraries(binding llama ggml_static)
# Metal builds place the shader two levels above the build dir so the
# runtime can locate it next to the binary.
if (LLAMA_METAL)
	configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_CURRENT_BINARY_DIR}/../../ggml-metal.metal COPYONLY)
endif()
# Stage libllama/libggml_static in the build directory, which is the
# -Lbuild path the Go package's cgo LDFLAGS expect.
add_custom_target(copy_libllama ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:llama> ${CMAKE_CURRENT_BINARY_DIR})
add_custom_target(copy_libggml_static ALL COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:ggml_static> ${CMAKE_CURRENT_BINARY_DIR})
This diff is collapsed.
// MIT License

// Copyright (c) 2023 go-skynet authors

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// C interface between the Go llama package and llama.cpp; the
// implementations live in binding/binding.cpp and are called via cgo.
#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>

// Implemented on the Go side (cgo //export). Invoked once per generated
// token; the Go code returns true (non-zero) while prediction should
// continue.
extern unsigned char tokenCallback(void *, char *);

// Restore saved llama state from statefile into ctx. "modes" is passed
// through to the C++ implementation — NOTE(review): exact semantics are
// defined in binding.cpp, confirm there. Returns non-zero on failure.
int load_state(void *ctx, char *statefile, char *modes);

// Evaluate the prompt text against ctx with the given parameter block
// (from llama_allocate_params). Returns non-zero on failure.
int eval(void *params_ptr, void *ctx, char *text);

// Persist the current llama state to dst (mirror of load_state).
void save_state(void *ctx, char *dst, char *modes);

// Load a model file and create an inference context. Returns an opaque
// context pointer, or NULL when loading fails (callers check for NULL).
void *load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16,
                 bool mlock, bool embeddings, bool mmap, bool low_vram,
                 bool vocab_only, int n_gpu, int n_batch, const char *maingpu,
                 const char *tensorsplit, bool numa);

// Copy the prompt embeddings into the caller-allocated res_embeddings
// buffer. Returns non-zero on failure.
int get_embeddings(void *params_ptr, void *state_pr, float *res_embeddings);

// Like get_embeddings, but for an explicit buffer of tokenSize tokens.
int get_token_embeddings(void *params_ptr, void *state_pr, int *tokens,
                         int tokenSize, float *res_embeddings);

// Allocate a prediction-parameter block from the given sampling and
// runtime settings. Must be released with llama_free_params.
void *llama_allocate_params(
    const char *prompt, int seed, int threads, int tokens, int top_k,
    float top_p, float temp, float repeat_penalty, int repeat_last_n,
    bool ignore_eos, bool memory_f16, int n_batch, int n_keep,
    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
    float frequency_penalty, float presence_penalty, int mirostat,
    float mirostat_eta, float mirostat_tau, bool penalize_nl,
    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
    const char *tensorsplit);

// Release a parameter block returned by llama_allocate_params.
void llama_free_params(void *params_ptr);

// Free the context/model created by load_model.
void llama_binding_free_model(void *state);

// Run prediction; generated text is written into the caller-allocated
// result buffer. Returns non-zero on failure.
int llama_predict(void *params_ptr, void *state_pr, char *result, bool debug);

#ifdef __cplusplus
}
#endif
// MIT License
// Copyright (c) 2023 go-skynet authors
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package llama package llama
// #cgo LDFLAGS: -Lbuild -lbinding -lllama -lm -lggml_static -lstdc++ /*
// #cgo CXXFLAGS: -std=c++11 #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
// #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders #include <stdlib.h>
// #include "binding/binding.h" #include "llama.h"
// #include <stdlib.h>
import "C" struct llama_sample_options
{
float repeat_penalty;
float frequency_penalty;
float presence_penalty;
float temperature;
int32_t top_k;
float top_p;
float tfs_z;
float typical_p;
int mirostat;
float mirostat_tau;
float mirostat_eta;
};
llama_token llama_sample(
struct llama_context *ctx,
struct llama_token_data *candidates,
size_t n_candidates,
const llama_token *last_tokens,
size_t n_last_tokens,
struct llama_sample_options *opts)
{
llama_token_data_array candidates_p = {
candidates,
n_candidates,
false,
};
llama_sample_repetition_penalty(
ctx, &candidates_p,
last_tokens, n_last_tokens,
opts->repeat_penalty);
llama_sample_frequency_and_presence_penalties(
ctx, &candidates_p,
last_tokens, n_last_tokens,
opts->frequency_penalty, opts->presence_penalty);
if (opts->temperature <= 0) {
return llama_sample_token_greedy(ctx, &candidates_p);
}
if (opts->mirostat == 1) {
int mirostat_m = 100;
float mirostat_mu = 2.0f * opts->mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
return llama_sample_token_mirostat(
ctx, &candidates_p,
opts->mirostat_tau, opts->mirostat_eta,
mirostat_m, &mirostat_mu);
} else if (opts->mirostat == 2) {
float mirostat_mu = 2.0f * opts->mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
return llama_sample_token_mirostat_v2(
ctx, &candidates_p,
opts->mirostat_tau, opts->mirostat_eta,
&mirostat_mu);
} else {
llama_sample_top_k(ctx, &candidates_p, opts->top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, opts->tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, opts->typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, opts->top_p, 1);
llama_sample_temperature(ctx, &candidates_p, opts->temperature);
return llama_sample_token(ctx, &candidates_p);
}
}
*/
import "C"
import ( import (
"fmt" "errors"
"io"
"os"
"strings" "strings"
"sync"
"unsafe" "unsafe"
"github.com/jmorganca/ollama/api"
) )
type LLama struct { type llama struct {
ctx unsafe.Pointer params *C.struct_llama_context_params
embeddings bool model *C.struct_llama_model
contextSize int ctx *C.struct_llama_context
}
func New(model string, mo ModelOptions) (*LLama, error) { api.Options
modelPath := C.CString(model) }
defer C.free(unsafe.Pointer(modelPath))
ctx := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit), C.bool(mo.NUMA)) func New(model string, opts api.Options) (*llama, error) {
if ctx == nil { if _, err := os.Stat(model); err != nil {
return nil, fmt.Errorf("failed loading model") return nil, err
} }
ll := &LLama{ctx: ctx, contextSize: mo.ContextSize, embeddings: mo.Embeddings} llm := llama{Options: opts}
return ll, nil C.llama_init_backend(C.bool(llm.UseNUMA))
params := C.llama_context_default_params()
params.seed = C.uint(llm.Seed)
params.n_ctx = C.int(llm.NumCtx)
params.n_batch = C.int(llm.NumBatch)
params.n_gpu_layers = C.int(llm.NumGPU)
params.main_gpu = C.int(llm.MainGPU)
params.low_vram = C.bool(llm.LowVRAM)
params.f16_kv = C.bool(llm.F16KV)
params.logits_all = C.bool(llm.LogitsAll)
params.vocab_only = C.bool(llm.VocabOnly)
params.use_mmap = C.bool(llm.UseMMap)
params.use_mlock = C.bool(llm.UseMLock)
params.embedding = C.bool(llm.EmbeddingOnly)
llm.params = &params
cModel := C.CString(model)
defer C.free(unsafe.Pointer(cModel))
llm.model = C.llama_load_model_from_file(cModel, params)
llm.ctx = C.llama_new_context_with_model(llm.model, params)
// warm up the model
bos := []C.llama_token{C.llama_token_bos()}
C.llama_eval(llm.ctx, unsafe.SliceData(bos), C.int(len(bos)), 0, C.int(opts.NumThread))
C.llama_reset_timings(llm.ctx)
return &llm, nil
} }
func (l *LLama) Free() { func (llm *llama) Close() {
C.llama_binding_free_model(l.ctx) defer C.llama_free_model(llm.model)
defer C.llama_free(llm.ctx)
C.llama_print_timings(llm.ctx)
} }
func (l *LLama) Eval(text string, po PredictOptions) error { func (llm *llama) Predict(prompt string, fn func(string)) error {
input := C.CString(text) if tokens := llm.tokenize(prompt); tokens != nil {
if po.Tokens == 0 { return llm.generate(tokens, fn)
po.Tokens = 99999999
}
defer C.free(unsafe.Pointer(input))
reverseCount := len(po.StopPrompts)
reversePrompt := make([]*C.char, reverseCount)
var pass **C.char
for i, s := range po.StopPrompts {
cs := C.CString(s)
reversePrompt[i] = cs
pass = &reversePrompt[0]
defer C.free(unsafe.Pointer(cs))
} }
cLogitBias := C.CString(po.LogitBias) return errors.New("llama: tokenize")
defer C.free(unsafe.Pointer(cLogitBias)) }
cMainGPU := C.CString(po.MainGPU)
defer C.free(unsafe.Pointer(cMainGPU))
cTensorSplit := C.CString(po.TensorSplit)
defer C.free(unsafe.Pointer(cTensorSplit))
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK), func (llm *llama) tokenize(prompt string) []C.llama_token {
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), cPrompt := C.CString(prompt)
C.bool(po.IgnoreEOS), C.bool(po.F16KV), defer C.free(unsafe.Pointer(cPrompt))
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
)
defer C.llama_free_params(params)
ret := C.eval(params, l.ctx, input) tokens := make([]C.llama_token, llm.NumCtx)
if ret != 0 { if n := C.llama_tokenize(llm.ctx, cPrompt, unsafe.SliceData(tokens), C.int(len(tokens)), true); n > 0 {
return fmt.Errorf("inference failed") return tokens[:n]
} }
return nil return nil
} }
func (l *LLama) Predict(text string, po PredictOptions) (string, error) { func (llm *llama) detokenize(tokens ...C.llama_token) string {
if po.TokenCallback != nil { var sb strings.Builder
setCallback(l.ctx, po.TokenCallback) for _, token := range tokens {
sb.WriteString(C.GoString(C.llama_token_to_str(llm.ctx, token)))
} }
input := C.CString(text) return sb.String()
if po.Tokens == 0 {
po.Tokens = 99999999
}
defer C.free(unsafe.Pointer(input))
out := make([]byte, po.Tokens)
reverseCount := len(po.StopPrompts)
reversePrompt := make([]*C.char, reverseCount)
var pass **C.char
for i, s := range po.StopPrompts {
cs := C.CString(s)
reversePrompt[i] = cs
pass = &reversePrompt[0]
defer C.free(unsafe.Pointer(cs))
}
cLogitBias := C.CString(po.LogitBias)
defer C.free(unsafe.Pointer(cLogitBias))
cMainGPU := C.CString(po.MainGPU)
defer C.free(unsafe.Pointer(cMainGPU))
cTensorSplit := C.CString(po.TensorSplit)
defer C.free(unsafe.Pointer(cTensorSplit))
params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat),
C.bool(po.IgnoreEOS), C.bool(po.F16KV),
C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
)
defer C.llama_free_params(params)
ret := C.llama_predict(params, l.ctx, (*C.char)(unsafe.Pointer(&out[0])), C.bool(po.DebugMode))
if ret != 0 {
return "", fmt.Errorf("inference failed")
}
res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
res = strings.TrimPrefix(res, " ")
res = strings.TrimPrefix(res, text)
res = strings.TrimPrefix(res, "\n")
for _, s := range po.StopPrompts {
res = strings.TrimRight(res, s)
}
if po.TokenCallback != nil {
setCallback(l.ctx, nil)
}
return res, nil
}
// CGo only allows us to use static calls from C to Go, we can't just dynamically pass in func's.
// This is the next best thing, we register the callbacks in this map and call tokenCallback from
// the C code. We also attach a finalizer to LLama, so it will unregister the callback when the
// garbage collection frees it.
// SetTokenCallback registers a callback for the individual tokens created when running Predict. It
// will be called once for each token. The callback shall return true as long as the model should
// continue predicting the next token. When the callback returns false the predictor will return.
// The tokens are just converted into Go strings, they are not trimmed or otherwise changed. Also
// the tokens may not be valid UTF-8.
// Pass in nil to remove a callback.
//
// It is save to call this method while a prediction is running.
func (l *LLama) SetTokenCallback(callback func(token string) bool) {
setCallback(l.ctx, callback)
} }
var ( func (llm *llama) generate(tokens []C.llama_token, fn func(string)) error {
m sync.Mutex var opts C.struct_llama_sample_options
callbacks = map[uintptr]func(string) bool{} opts.repeat_penalty = C.float(llm.RepeatPenalty)
) opts.frequency_penalty = C.float(llm.FrequencyPenalty)
opts.presence_penalty = C.float(llm.PresencePenalty)
//export tokenCallback opts.temperature = C.float(llm.Temperature)
func tokenCallback(statePtr unsafe.Pointer, token *C.char) bool { opts.top_k = C.int(llm.TopK)
m.Lock() opts.top_p = C.float(llm.TopP)
defer m.Unlock() opts.tfs_z = C.float(llm.TFSZ)
opts.typical_p = C.float(llm.TypicalP)
if callback, ok := callbacks[uintptr(statePtr)]; ok { opts.mirostat = C.int(llm.Mirostat)
return callback(C.GoString(token)) opts.mirostat_tau = C.float(llm.MirostatTau)
opts.mirostat_eta = C.float(llm.MirostatEta)
pastTokens := deque[C.llama_token]{capacity: llm.RepeatLastN}
for C.llama_get_kv_cache_token_count(llm.ctx) < C.int(llm.NumCtx) {
if retval := C.llama_eval(llm.ctx, unsafe.SliceData(tokens), C.int(len(tokens)), C.llama_get_kv_cache_token_count(llm.ctx), C.int(llm.NumThread)); retval != 0 {
return errors.New("llama: eval")
}
token, err := llm.sample(pastTokens, &opts)
switch {
case err != nil:
return err
case errors.Is(err, io.EOF):
return nil
}
fn(llm.detokenize(token))
tokens = []C.llama_token{token}
pastTokens.PushLeft(token)
} }
return true return nil
} }
// setCallback can be used to register a token callback for LLama. Pass in a nil callback to func (llm *llama) sample(pastTokens deque[C.llama_token], opts *C.struct_llama_sample_options) (C.llama_token, error) {
// remove the callback. numVocab := int(C.llama_n_vocab(llm.ctx))
func setCallback(statePtr unsafe.Pointer, callback func(string) bool) { logits := unsafe.Slice(C.llama_get_logits(llm.ctx), numVocab)
m.Lock()
defer m.Unlock() candidates := make([]C.struct_llama_token_data, 0, numVocab)
for i := 0; i < numVocab; i++ {
candidates = append(candidates, C.llama_token_data{
id: C.int(i),
logit: logits[i],
p: 0,
})
}
if callback == nil { token := C.llama_sample(
delete(callbacks, uintptr(statePtr)) llm.ctx,
} else { unsafe.SliceData(candidates), C.ulong(len(candidates)),
callbacks[uintptr(statePtr)] = callback unsafe.SliceData(pastTokens.Data()), C.ulong(pastTokens.Len()),
opts)
if token != C.llama_token_eos() {
return token, nil
} }
return 0, io.EOF
} }
//go:build cublas
// +build cublas

// cuBLAS build variant: links the binding against NVIDIA's CUDA BLAS
// runtime libraries.
package llama

/*
#cgo LDFLAGS: -lcublas -lcudart -L/usr/local/cuda/lib64/
*/
import "C"
//go:build metal

// Metal build variant: no extra cgo flags here — the Apple frameworks
// are linked from the main llama package's darwin LDFLAGS.
package llama
//go:build openblas
// +build openblas

// OpenBLAS build variant: links the binding against libopenblas.
package llama

/*
#cgo LDFLAGS: -lopenblas
*/
import "C"
// MIT License
// Copyright (c) 2023 go-skynet authors
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package llama
// ModelOptions are the load-time settings forwarded to the C binding's
// load_model (see binding/binding.h).
type ModelOptions struct {
	ContextSize int    // context window size (load_model n_ctx)
	Seed        int    // RNG seed (n_seed)
	NBatch      int    // prompt batch size (n_batch)
	F16Memory   bool   // memory_f16 flag
	MLock       bool   // lock model memory in RAM
	MMap        bool   // memory-map the model file
	VocabOnly   bool   // load vocabulary only, no weights
	LowVRAM     bool   // low_vram mode
	Embeddings  bool   // enable embedding extraction
	NUMA        bool   // enable NUMA support
	NGPULayers  int    // layers to offload to GPU (n_gpu)
	MainGPU     string // main GPU selector
	TensorSplit string // per-GPU tensor split
}
// PredictOptions are the per-request sampling and runtime settings
// forwarded to the C binding's llama_allocate_params.
type PredictOptions struct {
	Seed, Threads, Tokens, TopK, Repeat, Batch, NKeep int
	TopP, Temperature, Penalty                        float64
	F16KV             bool
	DebugMode         bool     // enable verbose output on the C side
	StopPrompts       []string // reverse prompts that end generation
	IgnoreEOS         bool
	TailFreeSamplingZ float64 // tail-free sampling parameter (tfs_z)
	TypicalP          float64
	FrequencyPenalty  float64
	PresencePenalty   float64
	Mirostat          int // 1 or 2 selects a mirostat version; 0 uses top-k/top-p sampling
	MirostatETA       float64
	MirostatTAU       float64
	PenalizeNL        bool
	LogitBias         string
	TokenCallback     func(string) bool // per-token hook; return false to stop predicting
	MLock, MMap       bool
	MainGPU           string
	TensorSplit       string
}
// PredictOption mutates a PredictOptions value (functional option).
type PredictOption func(p *PredictOptions)

// ModelOption mutates a ModelOptions value (functional option).
type ModelOption func(p *ModelOptions)
// DefaultModelOptions are the load-time settings used when the caller
// supplies none: a 512-token context with the model memory-mapped and
// every other feature switched off.
var DefaultModelOptions = ModelOptions{
	MMap:        true,
	ContextSize: 512,
	Seed:        0,
	F16Memory:   false,
	MLock:       false,
	Embeddings:  false,
	LowVRAM:     false,
}
// DefaultOptions are the prediction settings used when the caller
// supplies none. Fields omitted here (FrequencyPenalty, PresencePenalty,
// Mirostat, ...) take their zero values.
var DefaultOptions = PredictOptions{
	Seed:              -1,
	Threads:           4,
	Tokens:            128,
	Batch:             512,
	NKeep:             64,
	Repeat:            64,
	Penalty:           1.1,
	Temperature:       0.8,
	TopK:              40,
	TopP:              0.95,
	TailFreeSamplingZ: 1.0,
	TypicalP:          1.0,
	MirostatTAU:       5.0,
	MirostatETA:       0.1,
	MMap:              true,
}
package llama
// node is one element of the doubly-linked list backing deque.
type node[T any] struct {
	t    T        // stored value
	next *node[T] // toward the tail
	prev *node[T] // toward the head
}

// deque is a doubly-linked, double-ended queue. A capacity > 0 makes it
// bounded: pushing onto a full deque evicts an element from the
// opposite end first.
type deque[T any] struct {
	head     *node[T]
	tail     *node[T]
	size     int
	capacity int // 0 means unbounded
}
// Empty reports whether the deque currently holds no elements.
func (d *deque[T]) Empty() bool {
	return d.Len() == 0
}
// Len returns the number of elements currently in the deque.
func (d *deque[T]) Len() int {
	return d.size
}
// Cap returns the configured capacity; 0 means the deque is unbounded.
func (d *deque[T]) Cap() int {
	return d.capacity
}
// Push inserts t at the head. If the deque is bounded and full, the
// tail element is evicted first to make room.
func (d *deque[T]) Push(t T) {
	if d.capacity > 0 && d.size >= d.capacity {
		d.PopLeft()
	}
	n := &node[T]{t: t, next: d.head}
	if d.head == nil {
		// first element: head and tail coincide
		d.tail = n
	} else {
		d.head.prev = n
	}
	d.head = n
	d.size++
}
// PushLeft inserts t at the tail. If the deque is bounded and full, the
// head element is evicted first to make room.
func (d *deque[T]) PushLeft(t T) {
	if d.capacity > 0 && d.size >= d.capacity {
		d.Pop()
	}
	n := &node[T]{t: t, prev: d.tail}
	if d.tail == nil {
		// first element: head and tail coincide
		d.head = n
	} else {
		d.tail.next = n
	}
	d.tail = n
	d.size++
}
// Pop removes and returns the head element, or nil when the deque is
// empty.
func (d *deque[T]) Pop() *T {
	n := d.head
	if n == nil {
		return nil
	}
	d.head = n.next
	if d.head == nil {
		// removed the last element
		d.tail = nil
	} else {
		d.head.prev = nil
	}
	d.size--
	return &n.t
}
// PopLeft removes and returns the tail element, or nil when the deque
// is empty.
func (d *deque[T]) PopLeft() *T {
	n := d.tail
	if n == nil {
		return nil
	}
	d.tail = n.prev
	if d.tail == nil {
		// removed the last element
		d.head = nil
	} else {
		d.tail.next = nil
	}
	d.size--
	return &n.t
}
// Data returns the elements in head-to-tail order as a new slice. The
// result is nil when the deque is empty, matching the original
// append-built behavior.
func (d *deque[T]) Data() []T {
	if d.size == 0 {
		return nil
	}
	// Pre-size to the known length to avoid repeated append growth.
	data := make([]T, 0, d.size)
	for n := d.head; n != nil; n = n.next {
		data = append(data, n.t)
	}
	return data
}
...@@ -11,12 +11,12 @@ import ( ...@@ -11,12 +11,12 @@ import (
"net/http" "net/http"
"os" "os"
"path" "path"
"runtime"
"strings" "strings"
"text/template" "text/template"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/lithammer/fuzzysearch/fuzzy" "github.com/lithammer/fuzzysearch/fuzzy"
"golang.org/x/sync/errgroup"
"github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/llama" "github.com/jmorganca/ollama/llama"
...@@ -36,14 +36,10 @@ func cacheDir() string { ...@@ -36,14 +36,10 @@ func cacheDir() string {
} }
func generate(c *gin.Context) { func generate(c *gin.Context) {
var req api.GenerateRequest req := api.GenerateRequest{
if req.ModelOptions == nil { Options: api.DefaultOptions(),
req.ModelOptions = &api.DefaultModelOptions
} }
if req.PredictOptions == nil {
req.PredictOptions = &api.DefaultPredictOptions
}
if err := c.ShouldBindJSON(&req); err != nil { if err := c.ShouldBindJSON(&req); err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return return
...@@ -60,15 +56,12 @@ func generate(c *gin.Context) { ...@@ -60,15 +56,12 @@ func generate(c *gin.Context) {
req.Model = path.Join(cacheDir(), "models", req.Model+".bin") req.Model = path.Join(cacheDir(), "models", req.Model+".bin")
} }
modelOpts := getModelOpts(req) llm, err := llama.New(req.Model, req.Options)
modelOpts.NGPULayers = 1 // hard-code this for now
model, err := llama.New(req.Model, modelOpts)
if err != nil { if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return return
} }
defer model.Free() defer llm.Close()
templateNames := make([]string, 0, len(templates.Templates())) templateNames := make([]string, 0, len(templates.Templates()))
for _, template := range templates.Templates() { for _, template := range templates.Templates() {
...@@ -87,43 +80,41 @@ func generate(c *gin.Context) { ...@@ -87,43 +80,41 @@ func generate(c *gin.Context) {
} }
ch := make(chan string) ch := make(chan string)
model.SetTokenCallback(func(token string) bool { g, _ := errgroup.WithContext(c.Request.Context())
ch <- token g.Go(func() error {
return true
})
predictOpts := getPredictOpts(req)
go func() {
defer close(ch) defer close(ch)
_, err := model.Predict(req.Prompt, predictOpts) return llm.Predict(req.Prompt, func(s string) {
if err != nil { ch <- s
panic(err) })
} })
}()
c.Stream(func(w io.Writer) bool { g.Go(func() error {
token, ok := <-ch c.Stream(func(w io.Writer) bool {
if !ok { s, ok := <-ch
return false if !ok {
} return false
}
resp := api.GenerateResponse{ bts, err := json.Marshal(api.GenerateResponse{Response: s})
Response: token, if err != nil {
} return false
}
bts, err := json.Marshal(resp) bts = append(bts, '\n')
if err != nil { if _, err := w.Write(bts); err != nil {
return false return false
} }
bts = append(bts, '\n') return true
if _, err := w.Write(bts); err != nil { })
return false
}
return true return nil
}) })
if err := g.Wait(); err != nil && !errors.Is(err, io.EOF) {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
} }
func Serve(ln net.Listener) error { func Serve(ln net.Listener) error {
...@@ -195,53 +186,3 @@ func matchRankOne(source string, targets []string) (bestMatch string, bestRank i ...@@ -195,53 +186,3 @@ func matchRankOne(source string, targets []string) (bestMatch string, bestRank i
return return
} }
func getModelOpts(req api.GenerateRequest) llama.ModelOptions {
var opts llama.ModelOptions
opts.ContextSize = req.ModelOptions.ContextSize
opts.Seed = req.ModelOptions.Seed
opts.F16Memory = req.ModelOptions.F16Memory
opts.MLock = req.ModelOptions.MLock
opts.Embeddings = req.ModelOptions.Embeddings
opts.MMap = req.ModelOptions.MMap
opts.LowVRAM = req.ModelOptions.LowVRAM
opts.NBatch = req.ModelOptions.NBatch
opts.VocabOnly = req.ModelOptions.VocabOnly
opts.NUMA = req.ModelOptions.NUMA
opts.NGPULayers = req.ModelOptions.NGPULayers
opts.MainGPU = req.ModelOptions.MainGPU
opts.TensorSplit = req.ModelOptions.TensorSplit
return opts
}
func getPredictOpts(req api.GenerateRequest) llama.PredictOptions {
var opts llama.PredictOptions
if req.PredictOptions.Threads == -1 {
opts.Threads = runtime.NumCPU()
} else {
opts.Threads = req.PredictOptions.Threads
}
opts.Seed = req.PredictOptions.Seed
opts.Tokens = req.PredictOptions.Tokens
opts.Penalty = req.PredictOptions.Penalty
opts.Repeat = req.PredictOptions.Repeat
opts.Batch = req.PredictOptions.Batch
opts.NKeep = req.PredictOptions.NKeep
opts.TopK = req.PredictOptions.TopK
opts.TopP = req.PredictOptions.TopP
opts.TailFreeSamplingZ = req.PredictOptions.TailFreeSamplingZ
opts.TypicalP = req.PredictOptions.TypicalP
opts.Temperature = req.PredictOptions.Temperature
opts.FrequencyPenalty = req.PredictOptions.FrequencyPenalty
opts.PresencePenalty = req.PredictOptions.PresencePenalty
opts.Mirostat = req.PredictOptions.Mirostat
opts.MirostatTAU = req.PredictOptions.MirostatTAU
opts.MirostatETA = req.PredictOptions.MirostatETA
opts.MMap = req.PredictOptions.MMap
return opts
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment