remove prompt cache

da74384a · Bruce MacDonald · 45bf83ff · da74384a · da74384a · da74384a
Commit da74384a authored Jul 06, 2023 by Bruce MacDonald
Showing 4 changed files with 12 additions and 41 deletions

llama/binding/binding.cpp +3 -8

llama/binding/binding.h +2 -3

llama/llama.go +4 -10

llama/options.go +3 -20

No files found.
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
@@ -24,7 +24,7 @@
 #include <windows.h>
 #endif
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) ||          \
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
    defined(_WIN32)
 void sigint_handler(int signo) {
  if (signo == SIGINT) {
@@ -573,15 +573,13 @@ void *llama_allocate_params(
    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
    float frequency_penalty, float presence_penalty, int mirostat,
    float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
+    const char *tensorsplit) {
-    bool prompt_cache_ro) {
  gpt_params *params = new gpt_params;
  params->seed = seed;
  params->n_threads = threads;
  params->n_predict = tokens;
  params->repeat_last_n = repeat_last_n;
-  params->prompt_cache_ro = prompt_cache_ro;
  params->top_k = top_k;
  params->top_p = top_p;
  params->memory_f16 = memory_f16;
@@ -612,9 +610,6 @@ void *llama_allocate_params(
    }
  }
-  params->prompt_cache_all = prompt_cache_all;
-  params->path_prompt_cache = session_file;
  if (ignore_eos) {
    params->logit_bias[llama_token_eos()] = -INFINITY;
  }

--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
@@ -31,9 +31,8 @@ void *llama_allocate_params(
    const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
    float frequency_penalty, float presence_penalty, int mirostat,
    float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
+    const char *tensorsplit);
-    bool prompt_cache_ro);
 void llama_free_params(void *params_ptr);

--- a/llama/llama.go
+++ b/llama/llama.go
@@ -28,6 +28,7 @@ package llama
 // #include "binding/binding.h"
 // #include <stdlib.h>
 import "C"
 import (
 	"fmt"
 	"strings"
@@ -69,7 +70,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		po.Tokens = 99999999
 	}
 	defer C.free(unsafe.Pointer(input))
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	var pass **C.char
@@ -86,9 +87,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
-		C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
+		C.bool(po.MLock), C.bool(po.MMap), C.CString(po.MainGPU), C.CString(po.TensorSplit),
-		C.CString(po.MainGPU), C.CString(po.TensorSplit),
-		C.bool(po.PromptCacheRO),
 	)
 	defer C.llama_free_params(params)
@@ -128,9 +127,6 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 	cLogitBias := C.CString(po.LogitBias)
 	defer C.free(unsafe.Pointer(cLogitBias))
-	cPathPromptCache := C.CString(po.PathPromptCache)
-	defer C.free(unsafe.Pointer(cPathPromptCache))
 	cMainGPU := C.CString(po.MainGPU)
 	defer C.free(unsafe.Pointer(cMainGPU))
@@ -143,9 +139,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		cPathPromptCache, C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
+		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
-		cMainGPU, cTensorSplit,
-		C.bool(po.PromptCacheRO),
 	)
 	defer C.llama_free_params(params)

--- a/llama/options.go
+++ b/llama/options.go
@@ -57,11 +57,9 @@ type PredictOptions struct {
 	LogitBias         string
 	TokenCallback     func(string) bool
-	PathPromptCache             string
+	MLock, MMap bool
-	MLock, MMap, PromptCacheAll bool
+	MainGPU     string
-	PromptCacheRO               bool
+	TensorSplit string
-	MainGPU                     string
-	TensorSplit                 string
 }
 type PredictOption func(p *PredictOptions)
@@ -182,14 +180,6 @@ var Debug PredictOption = func(p *PredictOptions) {
 	p.DebugMode = true
 }
-var EnablePromptCacheAll PredictOption = func(p *PredictOptions) {
-	p.PromptCacheAll = true
-}
-var EnablePromptCacheRO PredictOption = func(p *PredictOptions) {
-	p.PromptCacheRO = true
-}
 var EnableMLock ModelOption = func(p *ModelOptions) {
 	p.MLock = true
 }
@@ -284,13 +274,6 @@ func SetTemperature(temp float64) PredictOption {
 	}
 }
-// SetPathPromptCache sets the session file to store the prompt cache.
-func SetPathPromptCache(f string) PredictOption {
-	return func(p *PredictOptions) {
-		p.PathPromptCache = f
-	}
-}
 // SetPenalty sets the repetition penalty for text generation.
 func SetPenalty(penalty float64) PredictOption {
 	return func(p *PredictOptions) {