Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
c4904161
Commit
c4904161
authored
Jul 20, 2023
by
Michael Yang
Browse files
lock on llm.lock(); decrease batch size
parent
f62a8827
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
7 deletions
+11
-7
api/types.go
api/types.go
+1
-1
llama/llama.go
llama/llama.go
+10
-6
No files found.
api/types.go
View file @
c4904161
...
@@ -177,7 +177,7 @@ func DefaultOptions() Options {
...
@@ -177,7 +177,7 @@ func DefaultOptions() Options {
UseNUMA:  false,
UseNUMA:  false,
NumCtx:   2048,
NumCtx:   2048,
NumBatch: 512,
NumBatch: 32,
NumGPU:   1,
NumGPU:   1,
LowVRAM:  false,
LowVRAM:  false,
F16KV:    true,
F16KV:    true,
...
...
llama/llama.go
View file @
c4904161
...
@@ -172,9 +172,6 @@ func (llm *LLM) Close() {
...
@@ -172,9 +172,6 @@ func (llm *LLM) Close() {
}
}
func (llm *LLM) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error {
func (llm *LLM) Predict(ctx []int, prompt string, fn func(api.GenerateResponse)) error {
llm.mu.Lock()
defer llm.mu.Unlock()
C.llama_reset_timings(llm.ctx)
C.llama_reset_timings(llm.ctx)
tokens := make([]C.llama_token, len(ctx))
tokens := make([]C.llama_token, len(ctx))
...
@@ -193,12 +190,12 @@ func (llm *LLM) Predict(ctx []int, prompt string, fn func(api.GenerateResponse))
...
@@ -193,12 +190,12 @@ func (llm *LLM) Predict(ctx []int, prompt string, fn func(api.GenerateResponse))
var b bytes.Buffer
var b bytes.Buffer
for {
for {
token, err := llm.next()
token, err := llm.next()
if errors.Is(err, io.EOF) {
if llm.gc {
    return nil
} else if errors.Is(err, io.EOF) {
    break
    break
} else if err != nil {
} else if err != nil {
    return err
    return err
} else if llm.gc {
    return io.EOF
}
}
b.WriteString(llm.detokenize(token))
b.WriteString(llm.detokenize(token))
...
@@ -293,6 +290,9 @@ func (llm *LLM) detokenize(tokens ...C.llama_token) string {
...
@@ -293,6 +290,9 @@ func (llm *LLM) detokenize(tokens ...C.llama_token) string {
}
}
func (llm *LLM) next() (C.llama_token, error) {
func (llm *LLM) next() (C.llama_token, error) {
llm.mu.Lock()
defer llm.mu.Unlock()
if len(llm.embd) >= llm.NumCtx {
if len(llm.embd) >= llm.NumCtx {
numLeft := (llm.NumCtx - llm.NumKeep) / 2
numLeft := (llm.NumCtx - llm.NumKeep) / 2
truncated := llm.embd[:llm.NumKeep]
truncated := llm.embd[:llm.NumKeep]
...
@@ -304,6 +304,10 @@ func (llm *LLM) next() (C.llama_token, error) {
...
@@ -304,6 +304,10 @@ func (llm *LLM) next() (C.llama_token, error) {
}
}
for {
for {
if llm.gc {
    return 0, io.EOF
}
if llm.cursor >= len(llm.embd) {
if llm.cursor >= len(llm.embd) {
    break
    break
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment