Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
565648f3
Unverified
Commit
565648f3
authored
Oct 18, 2023
by
Bruce MacDonald
Committed by
GitHub
Oct 18, 2023
Browse files
relay CUDA errors to the client (#825)
parent
3a247717
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
12 deletions
+35
-12
llm/llama.go
llm/llama.go
+35
-12
No files found.
llm/llama.go
View file @
565648f3
...
@@ -188,7 +188,7 @@ type Running struct {
...
@@ -188,7 +188,7 @@ type Running struct {
Cancel
context
.
CancelFunc
Cancel
context
.
CancelFunc
exitOnce
sync
.
Once
exitOnce
sync
.
Once
exitCh
chan
error
// channel to receive the exit status of the subprocess
exitCh
chan
error
// channel to receive the exit status of the subprocess
exitErr
error
// error returned by the sub
process
*
StatusWriter
// captures error messages from the llama runner
process
}
}
type
llama
struct
{
type
llama
struct
{
...
@@ -260,6 +260,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
...
@@ -260,6 +260,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
	// ErrCh receives a wrapped error for each error line detected in the
	// runner's output (see Write).
	ErrCh chan error
	// LastErrMsg holds the most recently captured error message, so callers
	// can report a better reason than the subprocess exit status alone.
	LastErrMsg string
}
func
NewStatusWriter
()
*
StatusWriter
{
func
NewStatusWriter
()
*
StatusWriter
{
...
@@ -269,9 +270,18 @@ func NewStatusWriter() *StatusWriter {
...
@@ -269,9 +270,18 @@ func NewStatusWriter() *StatusWriter {
}
}
func
(
w
*
StatusWriter
)
Write
(
b
[]
byte
)
(
int
,
error
)
{
func
(
w
*
StatusWriter
)
Write
(
b
[]
byte
)
(
int
,
error
)
{
var
errMsg
string
if
_
,
after
,
ok
:=
bytes
.
Cut
(
b
,
[]
byte
(
"error:"
));
ok
{
if
_
,
after
,
ok
:=
bytes
.
Cut
(
b
,
[]
byte
(
"error:"
));
ok
{
w
.
ErrCh
<-
fmt
.
Errorf
(
"llama runner: %s"
,
bytes
.
TrimSpace
(
after
))
errMsg
=
string
(
bytes
.
TrimSpace
(
after
))
}
else
if
_
,
after
,
ok
:=
bytes
.
Cut
(
b
,
[]
byte
(
"CUDA error"
));
ok
{
errMsg
=
string
(
bytes
.
TrimSpace
(
after
))
}
}
if
errMsg
!=
""
{
w
.
LastErrMsg
=
errMsg
w
.
ErrCh
<-
fmt
.
Errorf
(
"llama runner: %s"
,
errMsg
)
}
return
os
.
Stderr
.
Write
(
b
)
return
os
.
Stderr
.
Write
(
b
)
}
}
...
@@ -359,7 +369,13 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
...
@@ -359,7 +369,13 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
// monitor the llama runner process and signal when it exits
// monitor the llama runner process and signal when it exits
go
func
()
{
go
func
()
{
err
:=
llm
.
Cmd
.
Wait
()
err
:=
llm
.
Cmd
.
Wait
()
llm
.
exitErr
=
err
// default to printing the exit message of the command process, it will probably just say 'exit status 1'
errMsg
:=
err
.
Error
()
// try to set a better error message if llama runner logs captured an error
if
statusWriter
.
LastErrMsg
!=
""
{
errMsg
=
statusWriter
.
LastErrMsg
}
log
.
Println
(
errMsg
)
// llm.Cmd.Wait() can only be called once, use this exit channel to signal that the process has exited
// llm.Cmd.Wait() can only be called once, use this exit channel to signal that the process has exited
llm
.
exitOnce
.
Do
(
func
()
{
llm
.
exitOnce
.
Do
(
func
()
{
close
(
llm
.
exitCh
)
close
(
llm
.
exitCh
)
...
@@ -429,10 +445,9 @@ func (llm *llama) Close() {
...
@@ -429,10 +445,9 @@ func (llm *llama) Close() {
// wait for the command to exit to prevent race conditions with the next run
// wait for the command to exit to prevent race conditions with the next run
<-
llm
.
exitCh
<-
llm
.
exitCh
err
:=
llm
.
exitErr
if
err
!=
nil
{
if
llm
.
StatusWriter
!=
nil
&&
llm
.
StatusWriter
.
LastErrMsg
!=
""
{
log
.
Printf
(
"llama runner stopped with error: %v"
,
err
)
log
.
Printf
(
"llama runner stopped with error: %v"
,
llm
.
StatusWriter
.
LastErrMsg
)
}
else
{
}
else
{
log
.
Print
(
"llama runner stopped successfully"
)
log
.
Print
(
"llama runner stopped successfully"
)
}
}
...
@@ -569,6 +584,14 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
...
@@ -569,6 +584,14 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
}
}
if
err
:=
scanner
.
Err
();
err
!=
nil
{
if
err
:=
scanner
.
Err
();
err
!=
nil
{
if
strings
.
Contains
(
err
.
Error
(),
"unexpected EOF"
)
{
// this means the llama runner subprocess crashed
llm
.
Close
()
if
llm
.
StatusWriter
!=
nil
&&
llm
.
StatusWriter
.
LastErrMsg
!=
""
{
return
fmt
.
Errorf
(
"llama runner exited: %v"
,
llm
.
StatusWriter
.
LastErrMsg
)
}
return
fmt
.
Errorf
(
"llama runner exited, you may not have enough available memory to run this model"
)
}
return
fmt
.
Errorf
(
"error reading llm response: %v"
,
err
)
return
fmt
.
Errorf
(
"error reading llm response: %v"
,
err
)
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment