#include "arg.h"
#include "ggml.h"
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

int main(int argc, char ** argv){
xuxzh1's avatar
update  
xuxzh1 committed
16
    common_params params;
xuxzh1's avatar
init  
xuxzh1 committed
17

xuxzh1's avatar
update  
xuxzh1 committed
18
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
xuxzh1's avatar
init  
xuxzh1 committed
19
20
21
        return 1;
    }

xuxzh1's avatar
update  
xuxzh1 committed
22
23
    common_init();

xuxzh1's avatar
init  
xuxzh1 committed
24
    // max. number of additional tokens to draft if match is found
xuxzh1's avatar
update  
xuxzh1 committed
25
    const int n_draft = params.speculative.n_max;
xuxzh1's avatar
init  
xuxzh1 committed
26
27
28
29
30
31
32
33

    const bool dump_kv_cache = params.dump_kv_cache;

    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model
xuxzh1's avatar
update  
xuxzh1 committed
34
    common_init_result llama_init = common_init_from_params(params);
xuxzh1's avatar
init  
xuxzh1 committed
35
36
37
38
39
40

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;

    // tokenize the prompt
    std::vector<llama_token> inp;
xuxzh1's avatar
update  
xuxzh1 committed
41
    inp = common_tokenize(ctx, params.prompt, true, true);
xuxzh1's avatar
init  
xuxzh1 committed
42

xuxzh1's avatar
update  
xuxzh1 committed
43
44
45
    common_ngram_cache ngram_cache_context;
    common_ngram_cache ngram_cache_dynamic;
    common_ngram_cache ngram_cache_static;
xuxzh1's avatar
init  
xuxzh1 committed
46
47
48
49
50
51
    int64_t t_draft_flat_us = 0;
    int64_t t_draft_us = 0;

    {
        // Fill up context ngram cache with tokens from user input:
        const int64_t t_start_draft_us = ggml_time_us();
xuxzh1's avatar
update  
xuxzh1 committed
52
        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
xuxzh1's avatar
init  
xuxzh1 committed
53
54
55

        if (!params.lookup_cache_static.empty()) {
            try {
xuxzh1's avatar
update  
xuxzh1 committed
56
                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
xuxzh1's avatar
init  
xuxzh1 committed
57
            } catch (std::ifstream::failure const &) {
xuxzh1's avatar
update  
xuxzh1 committed
58
                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
59
60
61
62
63
64
                exit(1);
            }
        }

        if (!params.lookup_cache_dynamic.empty()) {
            try {
xuxzh1's avatar
update  
xuxzh1 committed
65
                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
xuxzh1's avatar
init  
xuxzh1 committed
66
67
68
69
70
71
72
73
74
75
            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
        }

        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
    }

    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
xuxzh1's avatar
update  
xuxzh1 committed
76
        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
xuxzh1's avatar
init  
xuxzh1 committed
77
78
79
        return 1;
    }

xuxzh1's avatar
update  
xuxzh1 committed
80
    LOG("\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
81
82

    for (auto id : inp) {
xuxzh1's avatar
update  
xuxzh1 committed
83
        LOG("%s", common_token_to_piece(ctx, id).c_str());
xuxzh1's avatar
init  
xuxzh1 committed
84
85
86
87
88
89
90
91
    }

    fflush(stderr);

    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();

xuxzh1's avatar
update  
xuxzh1 committed
92
93
    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
xuxzh1's avatar
init  
xuxzh1 committed
94
95
96
97
98
99
100
101
102
103
104

    const auto t_enc_end = ggml_time_us();

    int n_predict = 0;
    int n_drafted = 0;
    int n_accept  = 0;

    int n_past = inp.size();

    bool has_eos = false;

xuxzh1's avatar
update  
xuxzh1 committed
105
    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
xuxzh1's avatar
init  
xuxzh1 committed
106
107
108
109
110
111
112
113
114
115
116
117
118
119

    std::vector<llama_token> draft;

    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

    // debug
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

    const auto t_dec_start = ggml_time_us();

    while (true) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
xuxzh1's avatar
update  
xuxzh1 committed
120
            common_kv_cache_dump_view_seqs(kvc_view, 40);
xuxzh1's avatar
init  
xuxzh1 committed
121
122
123
        }

        // print current draft sequence
xuxzh1's avatar
update  
xuxzh1 committed
124
        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
xuxzh1's avatar
init  
xuxzh1 committed
125
126
127
128

        int i_dft = 0;
        while (true) {
            // sample from the target model
xuxzh1's avatar
update  
xuxzh1 committed
129
            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
xuxzh1's avatar
init  
xuxzh1 committed
130

xuxzh1's avatar
update  
xuxzh1 committed
131
            common_sampler_accept(smpl, id, true);
xuxzh1's avatar
init  
xuxzh1 committed
132

xuxzh1's avatar
update  
xuxzh1 committed
133
            const std::string token_str = common_token_to_piece(ctx, id);
xuxzh1's avatar
init  
xuxzh1 committed
134
135

            if (!params.use_color) {
xuxzh1's avatar
update  
xuxzh1 committed
136
                LOG("%s", token_str.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
137
138
139
140
141
142
143
144
145
146
            }

            if (llama_token_is_eog(model, id)) {
                has_eos = true;
            }

            ++n_predict;

            // check if the target token matches the draft
            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
xuxzh1's avatar
update  
xuxzh1 committed
147
                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
148
149
150
151
152
153
154
                ++n_accept;
                ++n_past;
                ++i_dft;
                inp.push_back(id);
                {
                    // Update context ngram cache with the newly accepted token:
                    const int64_t t_start_draft_us = ggml_time_us();
xuxzh1's avatar
update  
xuxzh1 committed
155
                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
xuxzh1's avatar
init  
xuxzh1 committed
156
157
158
159
160
                    t_draft_us += ggml_time_us() - t_start_draft_us;
                }

                if (params.use_color) {
                    // color accepted draft token
xuxzh1's avatar
update  
xuxzh1 committed
161
                    LOG("\033[34m%s\033[0m", token_str.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
162
163
164
165
166
167
                    fflush(stdout);
                }
                continue;
            }

            if (params.use_color) {
xuxzh1's avatar
update  
xuxzh1 committed
168
                LOG("%s", token_str.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
169
170
171
172
            }
            fflush(stdout);


xuxzh1's avatar
update  
xuxzh1 committed
173
            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
xuxzh1's avatar
init  
xuxzh1 committed
174
175
176
177
178
179
180

            draft.clear();
            draft.push_back(id);
            inp.push_back(id);
            {
                // Update context ngram cache with the newly accepted token:
                const int64_t t_start_draft_us = ggml_time_us();
xuxzh1's avatar
update  
xuxzh1 committed
181
                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
xuxzh1's avatar
init  
xuxzh1 committed
182
183
184
185
186
187
188
189
190
191
192
193
194
                t_draft_us += ggml_time_us() - t_start_draft_us;
            }
            break;
        }

        if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
            break;
        }

        // KV cache management
        // clean the cache of draft tokens that weren't accepted
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

xuxzh1's avatar
update  
xuxzh1 committed
195
196
        common_batch_clear(batch_tgt);
        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
xuxzh1's avatar
init  
xuxzh1 committed
197
198
199
200
201
202

        // Draft already contains a single token sampled from the model:
        GGML_ASSERT(draft.size() == 1);
        GGML_ASSERT(draft[0] == inp.back());
        const int64_t t_start_draft_us = ggml_time_us();

xuxzh1's avatar
update  
xuxzh1 committed
203
        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
xuxzh1's avatar
init  
xuxzh1 committed
204
205

        for (size_t i = 1; i < draft.size(); ++i) {
xuxzh1's avatar
update  
xuxzh1 committed
206
            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
xuxzh1's avatar
init  
xuxzh1 committed
207
208
209
210
211
212
213
214
215
216
217
218
219
220
        }

        t_draft_us += ggml_time_us() - t_start_draft_us;
        n_drafted += draft.size() - 1;

        llama_decode(ctx, batch_tgt);
        ++n_past;

        draft.erase(draft.begin());
    }

    auto t_dec_end = ggml_time_us();

    // Update dynamic ngram cache with context ngram cache and save it to disk:
xuxzh1's avatar
update  
xuxzh1 committed
221
222
    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
xuxzh1's avatar
init  
xuxzh1 committed
223

xuxzh1's avatar
update  
xuxzh1 committed
224
    LOG("\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
225

xuxzh1's avatar
update  
xuxzh1 committed
226
227
    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
xuxzh1's avatar
init  
xuxzh1 committed
228

xuxzh1's avatar
update  
xuxzh1 committed
229
230
231
232
233
234
    LOG_INF("\n");
    LOG_INF("n_draft      = %d\n", n_draft);
    LOG_INF("n_predict    = %d\n", n_predict);
    LOG_INF("n_drafted    = %d\n", n_drafted);
    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
xuxzh1's avatar
init  
xuxzh1 committed
235
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
xuxzh1's avatar
update  
xuxzh1 committed
236
237
238
239
240
    LOG_INF("n_accept     = %d\n", n_accept);
    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_INF("\ntarget:\n\n");
    common_perf_print(ctx, smpl);
xuxzh1's avatar
init  
xuxzh1 committed
241

xuxzh1's avatar
update  
xuxzh1 committed
242
    common_sampler_free(smpl);
xuxzh1's avatar
init  
xuxzh1 committed
243
244
245
246
247
248
249
250

    llama_batch_free(batch_tgt);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

xuxzh1's avatar
update  
xuxzh1 committed
251
    LOG("\n\n");
xuxzh1's avatar
init  
xuxzh1 committed
252
253
254

    return 0;
}