// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.

#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <sstream>
#include <string>
#include <vector>

// Return a copy of `str` with the whitespace at its beginning and end removed.
//
// Whitespace is whatever std::isspace reports. An empty or all-whitespace input yields an
// empty string; whitespace in the middle of the string is left untouched.
static std::string trim(const std::string & str) {
    // std::isspace is undefined for negative arguments other than EOF, and plain `char` may be
    // signed, so every byte is converted to unsigned char before classification
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end   = str.size();

    while (start < end && is_space(str[start])) {
        start += 1;
    }

    while (end > start && is_space(str[end - 1])) {
        end -= 1;
    }

    return str.substr(start, end - start);
}

// System prompt shared by every simulated client.
// main() tokenizes it once (with BOS), decodes it into sequence 0 and copies that KV cache into
// each client sequence. The text deliberately ends with "User:" so that a client request, which
// is built as `<question> + "\nAssistant:"`, continues the transcript directly.
static std::string k_system =
R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

User: Recommend a nice restaurant in the area.
Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
User: Who is Richard Feynman?
Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
User:)";

// Pool of user questions the simulated clients draw from; main() picks one per request with
// rand(). These are the built-in defaults: when params.prompt is not empty, main() replaces the
// contents of this vector with the lines of that prompt.
static std::vector<std::string> k_prompts = {
    "What is the meaning of life?",
    "Tell me an interesting fact about llamas.",
    "What is the best way to cook a steak?",
    "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
    "Recommend some interesting books to read.",
    "What is the best way to learn a new language?",
    "How to get a job at Google?",
    "If you could have any superpower, what would it be?",
    "I want to learn how to play the piano.",
};

struct client {
    ~client() {
xuxzh1's avatar
update  
xuxzh1 committed
56
57
        if (smpl) {
            common_sampler_free(smpl);
xuxzh1's avatar
init  
xuxzh1 committed
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
        }
    }

    int32_t id = 0;

    llama_seq_id seq_id = -1;

    llama_token sampled;

    int64_t t_start_prompt;
    int64_t t_start_gen;

    int32_t n_prompt  = 0;
    int32_t n_decoded = 0;
    int32_t i_batch   = -1;

    std::string input;
    std::string prompt;
    std::string response;

xuxzh1's avatar
update  
xuxzh1 committed
78
    struct common_sampler * smpl = nullptr;
xuxzh1's avatar
init  
xuxzh1 committed
79
80
81
82
83
84
85
86
};

// Log a banner with the current local date and time ("run parameters as of ...").
//
// If the local time cannot be obtained or formatted, the placeholder "unknown" is logged
// instead of reading an unspecified buffer.
static void print_date_time() {
    const std::time_t current_time = std::time(nullptr);
    const std::tm *   local_time   = std::localtime(&current_time);

    char buffer[80];

    // std::localtime may return a null pointer, and std::strftime returns 0 (leaving the buffer
    // contents unspecified) when the formatted text does not fit
    const bool formatted =
        local_time != nullptr &&
        std::strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time) > 0;

    LOG_INF("\n");
    LOG_INF("\033[35mrun parameters as of %s\033[0m\n", formatted ? buffer : "unknown");
    LOG_INF("\n");
}

// Split `input` at every occurrence of `delimiter` and return the pieces in order.
//
// Empty pieces between adjacent delimiters are kept. A delimiter at the very end of the input
// does not produce a trailing empty piece, and an empty input yields no pieces at all.
static std::vector<std::string> split_string(const std::string& input, char delimiter) {
    std::vector<std::string> pieces;

    std::string::size_type begin = 0;
    while (begin < input.size()) {
        const std::string::size_type cut = input.find(delimiter, begin);
        if (cut == std::string::npos) {
            // no more delimiters: the rest of the input is the last piece
            pieces.push_back(input.substr(begin));
            break;
        }

        pieces.push_back(input.substr(begin, cut - begin));
        begin = cut + 1;
    }

    return pieces;
}

int main(int argc, char ** argv) {
    srand(1234);

xuxzh1's avatar
update  
xuxzh1 committed
106
    common_params params;
xuxzh1's avatar
init  
xuxzh1 committed
107

xuxzh1's avatar
update  
xuxzh1 committed
108
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
xuxzh1's avatar
init  
xuxzh1 committed
109
110
111
        return 1;
    }

xuxzh1's avatar
update  
xuxzh1 committed
112
113
    common_init();

xuxzh1's avatar
init  
xuxzh1 committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
    // number of simultaneous "clients" to simulate
    const int32_t n_clients = params.n_parallel;

    // dedicate one sequence to the system prompt
    params.n_parallel += 1;

    // requests to simulate
    const int32_t n_seq = params.n_sequences;

    // insert new requests as soon as the previous one is done
    const bool cont_batching = params.cont_batching;

    const bool dump_kv_cache = params.dump_kv_cache;

    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the target model
xuxzh1's avatar
update  
xuxzh1 committed
133
    common_init_result llama_init = common_init_from_params(params);
xuxzh1's avatar
init  
xuxzh1 committed
134
135
136
137
138
139

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;

    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
xuxzh1's avatar
update  
xuxzh1 committed
140
        LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
xuxzh1's avatar
init  
xuxzh1 committed
141
142
143
    } else {
        // Output each line of the input params.prompts vector and copy to k_prompts
        int index = 0;
xuxzh1's avatar
update  
xuxzh1 committed
144
        LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
145
146
147
148
149
150

        std::vector<std::string> prompts = split_string(params.prompt, '\n');
        for (const auto& prompt : prompts) {
            k_prompts.resize(index + 1);
            k_prompts[index] = prompt;
            index++;
xuxzh1's avatar
update  
xuxzh1 committed
151
            LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
152
153
154
        }
    }

xuxzh1's avatar
update  
xuxzh1 committed
155
    LOG_INF("\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
156
157
158
159
160
161
162

    const int n_ctx = llama_n_ctx(ctx);

    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
xuxzh1's avatar
update  
xuxzh1 committed
163
        client.smpl = common_sampler_init(model, params.sampling);
xuxzh1's avatar
init  
xuxzh1 committed
164
165
166
    }

    std::vector<llama_token> tokens_system;
xuxzh1's avatar
update  
xuxzh1 committed
167
    tokens_system = common_tokenize(ctx, k_system, true);
xuxzh1's avatar
init  
xuxzh1 committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
    const int32_t n_tokens_system = tokens_system.size();

    llama_seq_id g_seq_id = 0;

    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
    llama_batch batch = llama_batch_init(n_ctx, 0, 1);

    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
    int32_t n_cache_miss   = 0;

    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

    const auto t_main_start = ggml_time_us();

xuxzh1's avatar
update  
xuxzh1 committed
184
185
186
    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    LOG_INF("\n");
xuxzh1's avatar
init  
xuxzh1 committed
187
188

    {
xuxzh1's avatar
update  
xuxzh1 committed
189
        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
xuxzh1's avatar
init  
xuxzh1 committed
190
191

        for (int32_t i = 0; i < n_tokens_system; ++i) {
xuxzh1's avatar
update  
xuxzh1 committed
192
            common_batch_add(batch, tokens_system[i], i, { 0 }, false);
xuxzh1's avatar
init  
xuxzh1 committed
193
194
195
        }

        if (llama_decode(ctx, batch) != 0) {
xuxzh1's avatar
update  
xuxzh1 committed
196
            LOG_ERR("%s: llama_decode() failed\n", __func__);
xuxzh1's avatar
init  
xuxzh1 committed
197
198
199
200
201
202
203
204
            return 1;
        }

        // assign the system KV cache to all parallel sequences
        for (int32_t i = 1; i <= n_clients; ++i) {
            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
        }

xuxzh1's avatar
update  
xuxzh1 committed
205
        LOG_INF("\n");
xuxzh1's avatar
init  
xuxzh1 committed
206
207
    }

xuxzh1's avatar
update  
xuxzh1 committed
208
    LOG_INF("Processing requests ...\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
209
210
211
212

    while (true) {
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
xuxzh1's avatar
update  
xuxzh1 committed
213
            common_kv_cache_dump_view_seqs(kvc_view, 40);
xuxzh1's avatar
init  
xuxzh1 committed
214
215
        }

xuxzh1's avatar
update  
xuxzh1 committed
216
        common_batch_clear(batch);
xuxzh1's avatar
init  
xuxzh1 committed
217
218
219
220
221
222
223
224
225

        // decode any currently ongoing sequences
        for (auto & client : clients) {
            if (client.seq_id == -1) {
                continue;
            }

            client.i_batch = batch.n_tokens;

xuxzh1's avatar
update  
xuxzh1 committed
226
            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
xuxzh1's avatar
init  
xuxzh1 committed
227
228
229
230
231
232
233
234
235
236
237
238

            client.n_decoded += 1;
        }

        if (batch.n_tokens == 0) {
            // all sequences have ended - clear the entire KV cache
            for (int i = 1; i <= n_clients; ++i) {
                llama_kv_cache_seq_rm(ctx, i, -1, -1);
                // but keep the system prompt
                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
            }

xuxzh1's avatar
update  
xuxzh1 committed
239
            LOG_INF("%s: clearing the KV cache\n", __func__);
xuxzh1's avatar
init  
xuxzh1 committed
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
        }

        // insert new sequences for decoding
        if (cont_batching || batch.n_tokens == 0) {
            for (auto & client : clients) {
                if (client.seq_id == -1 && g_seq_id < n_seq) {
                    client.seq_id = g_seq_id;

                    client.t_start_prompt = ggml_time_us();
                    client.t_start_gen    = 0;

                    client.input    = k_prompts[rand() % k_prompts.size()];
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

xuxzh1's avatar
update  
xuxzh1 committed
255
                    common_sampler_reset(client.smpl);
xuxzh1's avatar
init  
xuxzh1 committed
256
257
258

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
xuxzh1's avatar
update  
xuxzh1 committed
259
                    tokens_prompt = common_tokenize(ctx, client.prompt, false);
xuxzh1's avatar
init  
xuxzh1 committed
260
261

                    for (size_t i = 0; i < tokens_prompt.size(); ++i) {
xuxzh1's avatar
update  
xuxzh1 committed
262
                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
xuxzh1's avatar
init  
xuxzh1 committed
263
264
265
266
267
268
269
270
271
272
273
                    }

                    // extract the logits only for the last token
                    if (batch.n_tokens > 0) {
                        batch.logits[batch.n_tokens - 1] = true;
                    }

                    client.n_prompt  = tokens_prompt.size();
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

xuxzh1's avatar
update  
xuxzh1 committed
274
                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
xuxzh1's avatar
init  
xuxzh1 committed
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316

                    g_seq_id += 1;

                    // insert new requests one-by-one
                    //if (cont_batching) {
                    //    break;
                    //}
                }
            }
        }

        if (batch.n_tokens == 0) {
            break;
        }

        // process in chunks of params.n_batch
        int32_t n_batch = params.n_batch;

        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            // experiment: process in powers of 2
            //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
            //    n_batch /= 2;
            //    i -= n_batch;
            //    continue;
            //}

            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
xuxzh1's avatar
update  
xuxzh1 committed
317
                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
xuxzh1's avatar
init  
xuxzh1 committed
318
319
320
                    return 1;
                }

xuxzh1's avatar
update  
xuxzh1 committed
321
                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
xuxzh1's avatar
init  
xuxzh1 committed
322
323
324
325
326
327
328
329
330
331

                n_cache_miss += 1;

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
                i -= n_batch;

                continue;
            }

xuxzh1's avatar
update  
xuxzh1 committed
332
            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
xuxzh1's avatar
init  
xuxzh1 committed
333
334
335
336
337
338
339
340
341

            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                    continue;
                }

                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

xuxzh1's avatar
update  
xuxzh1 committed
342
                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
xuxzh1's avatar
init  
xuxzh1 committed
343

xuxzh1's avatar
update  
xuxzh1 committed
344
                common_sampler_accept(client.smpl, id, true);
xuxzh1's avatar
init  
xuxzh1 committed
345
346
347
348
349
350
351

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
                    // have their prompt already processed
                    client.t_start_gen = ggml_time_us();
                }

xuxzh1's avatar
update  
xuxzh1 committed
352
                const std::string token_str = common_token_to_piece(ctx, id);
xuxzh1's avatar
init  
xuxzh1 committed
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371

                client.response += token_str;
                client.sampled = id;

                //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
                        (llama_token_is_eog(model, id) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
                    // basic reverse prompt
                    const size_t pos = client.response.find("User:");
                    if (pos != std::string::npos) {
                        client.response = client.response.substr(0, pos);
                    }

                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
xuxzh1's avatar
update  
xuxzh1 committed
372
                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
xuxzh1's avatar
init  
xuxzh1 committed
373
374
375
376
                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

                    const auto t_main_end = ggml_time_us();

xuxzh1's avatar
update  
xuxzh1 committed
377
                    LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
xuxzh1's avatar
init  
xuxzh1 committed
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
                            client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                            (t_main_end - client.t_start_prompt) / 1e6,
                            (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                            n_cache_miss,
                            ::trim(client.input).c_str(),
                            ::trim(client.response).c_str());

                    n_total_prompt += client.n_prompt;
                    n_total_gen    += client.n_decoded;

                    client.seq_id = -1;
                }

                client.i_batch = -1;
            }
        }
    }

    const auto t_main_end = ggml_time_us();

    print_date_time();

xuxzh1's avatar
update  
xuxzh1 committed
400
    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
xuxzh1's avatar
init  
xuxzh1 committed
401
402
403
    if (params.prompt_file.empty()) {
        params.prompt_file = "used built-in defaults";
    }
xuxzh1's avatar
update  
xuxzh1 committed
404
405
    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
406

xuxzh1's avatar
update  
xuxzh1 committed
407
408
409
410
    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
    LOG_INF("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
    LOG_INF("Cache misses:        %6d\n", n_cache_miss);
xuxzh1's avatar
init  
xuxzh1 committed
411

xuxzh1's avatar
update  
xuxzh1 committed
412
    LOG_INF("\n");
xuxzh1's avatar
init  
xuxzh1 committed
413

xuxzh1's avatar
update  
xuxzh1 committed
414
415
    // TODO: print sampling/grammar timings for all clients
    llama_perf_context_print(ctx);
xuxzh1's avatar
init  
xuxzh1 committed
416
417
418
419
420
421
422
423

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

xuxzh1's avatar
update  
xuxzh1 committed
424
    LOG("\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
425
426
427

    return 0;
}