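// Command main exposes large language models over a small HTTP API:
// a prompt POSTed to /generate is streamed back token by token.
//
// Example (assuming the server is running locally):
//
//	curl -d "why is the sky blue?" http://localhost:8080/generate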
package main

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"
	"os"
	"runtime"

	"github.com/sashabaranov/go-openai"

	llama "github.com/go-skynet/go-llama.cpp"
)

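// Model is the interface every served backend implements: a
// human-readable name plus an HTTP handler that generates a
// completion from the request body.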
type Model interface {
	Name() string
	Handler(w http.ResponseWriter, r *http.Request)
}

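// LLama7B wraps a llama.cpp model loaded from local GGML weights.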
type LLama7B struct {
	llama *llama.LLama
}

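// NewLLama7B loads the quantized 7B weights and configures the backend:
// F16 KV-cache memory, a 128-token context window, embeddings enabled,
// and up to 128 layers offloaded to the GPU. Failure to load is fatal.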
func NewLLama7B() *LLama7B {
	model, err := llama.New(
		"./models/7B/ggml-model-q4_0.bin",
		llama.EnableF16Memory,
		llama.SetContext(128),
		llama.EnableEmbeddings,
		llama.SetGPULayers(128),
	)
	if err != nil {
		fmt.Println("Loading the model failed:", err.Error())
		os.Exit(1)
	}

	return &LLama7B{
		llama: model,
	}
}

func (l *LLama7B) Name() string {
	return "LLaMA 7B"
}

func (l *LLama7B) Handler(w http.ResponseWriter, r *http.Request) {
	// Headers must be set before the first write to the response body,
	// or they are silently dropped.
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	var text bytes.Buffer
	if _, err := io.Copy(&text, r.Body); err != nil {
		http.Error(w, "reading request body failed", http.StatusBadRequest)
		return
	}

	flusher, _ := w.(http.Flusher)

	_, err := l.llama.Predict(text.String(), llama.Debug, llama.SetTokenCallback(func(token string) bool {
		// Stream each token to the client as soon as it is generated.
		w.Write([]byte(token))
		if flusher != nil {
			flusher.Flush()
		}
		return true
	}), llama.SetTokens(512), llama.SetThreads(runtime.NumCPU()), llama.SetTopK(90), llama.SetTopP(0.86), llama.SetStopWords("llama"))
	if err != nil {
		// Log and abandon the request: one failed prediction should not
		// take down the whole server.
		fmt.Println("Predict failed:", err.Error())
		return
	}

	embeds, err := l.llama.Embeddings(text.String())
	if err != nil {
		fmt.Printf("Embeddings: error %s\n", err.Error())
	}
	fmt.Printf("Embeddings: %v\n", embeds)
}

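// GPT4 serves the same kind of request by calling the OpenAI API
// instead of a local model; apiKey holds the OpenAI API key supplied
// by the caller.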
type GPT4 struct {
	apiKey string
}

func (g *GPT4) Name() string {
	return "OpenAI GPT-4"
}

func (g *GPT4) Handler(w http.ResponseWriter, r *http.Request) {
	var prompt bytes.Buffer
	if _, err := io.Copy(&prompt, r.Body); err != nil {
		http.Error(w, "reading request body failed", http.StatusBadRequest)
		return
	}

	client := openai.NewClient(g.apiKey)
	resp, err := client.CreateChatCompletion(
		context.Background(),
		openai.ChatCompletionRequest{
			Model: openai.GPT4,
			Messages: []openai.ChatCompletionMessage{
				{
					// Forward the request body as the user prompt,
					// matching the local LLaMA handler.
					Role:    openai.ChatMessageRoleUser,
					Content: prompt.String(),
				},
			},
		},
	)
	if err != nil {
		fmt.Printf("chat completion error: %v\n", err)
		http.Error(w, "chat completion failed", http.StatusInternalServerError)
		return
	}

	// Headers must be set before the body is written.
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.Write([]byte(resp.Choices[0].Message.Content))
}

// TODO: add subcommands to spawn different models
func main() {
	// Load the model through the constructor so the weights are read up
	// front and a bad model path fails at startup, not on first request.
	model := NewLLama7B()

	http.HandleFunc("/generate", model.Handler)

	fmt.Println("Starting server on :8080")
	if err := http.ListenAndServe(":8080", nil); err != nil {
		fmt.Printf("Error starting server: %s\n", err)
		os.Exit(1)
	}
}