#include "llama.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
    printf("\n");
}

int main(int argc, char ** argv) {
    // path to the model gguf file
    std::string model_path;
    // prompt to generate text from
    std::string prompt = "Hello my name is";
    // number of layers to offload to the GPU
    int ngl = 99;
    // number of tokens to predict
    int n_predict = 32;

    // parse command line arguments

    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-n") == 0) {
                if (i + 1 < argc) {
                    try {
                        n_predict = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // prompt starts here
                break;
            }
        }
        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
        if (i < argc) {
            prompt = argv[i++];
            for (; i < argc; i++) {
                prompt += " ";
                prompt += argv[i];
            }
        }
    }

    // initialize the model

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    // tokenize the prompt

    // find the number of tokens in the prompt
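    // (passing a NULL output buffer makes llama_tokenize return the negative of the required token count)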
    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

    // allocate space for the tokens and tokenize the prompt
    std::vector<llama_token> prompt_tokens(n_prompt);
    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
        return 1;
    }

    // initialize the context

    llama_context_params ctx_params = llama_context_default_params();
    // n_ctx is the context size
    ctx_params.n_ctx = n_prompt + n_predict - 1;
    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
    ctx_params.n_batch = n_prompt;
    // enable performance counters
    ctx_params.no_perf = false;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    // initialize the sampler

    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false;
    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // print the prompt token-by-token

    for (auto id : prompt_tokens) {
        char buf[128];
        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
        if (n < 0) {
            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
            return 1;
        }
        std::string s(buf, n);
        printf("%s", s.c_str());
    }

    // prepare a batch for the prompt

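    // the batch references the prompt_tokens buffer directly; the tokens are not copied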
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    // main loop

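    // record the start time (in microseconds) to measure decoding speed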
    const auto t_main_start = ggml_time_us();
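    // number of tokens sampled so far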
    int n_decode = 0;
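    // id of the token sampled at each step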
    llama_token new_token_id;

    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
        // evaluate the current batch with the transformer model
        const int ret = llama_decode(ctx, batch);
        if (ret != 0) {
            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, ret);
            return 1;
        }

        n_pos += batch.n_tokens;

        // sample the next token
        {
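            // index -1 samples from the logits of the last token in the previous batch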
            new_token_id = llama_sampler_sample(smpl, ctx, -1);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id)) {
                break;
            }

            char buf[128];
            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
            if (n < 0) {
                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                return 1;
            }
            std::string s(buf, n);
            printf("%s", s.c_str());
            fflush(stdout);

            // prepare the next batch with the sampled token
            batch = llama_batch_get_one(&new_token_id, 1);

            n_decode += 1;
        }
    }

    printf("\n");

    const auto t_main_end = ggml_time_us();

    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

    fprintf(stderr, "\n");
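    // print the performance counters collected by the sampler chain and the context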
    llama_perf_sampler_print(smpl);
    llama_perf_context_print(ctx);
    fprintf(stderr, "\n");

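    // free all resources in reverse order of creation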
    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

    return 0;
}