#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"

#include <cstdio>
#include <string>
#include <vector>
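
/**
 * Demonstrates ggml_backend_sched_eval_callback: the prompt is evaluated once
 * and the name, op, shape and (truncated) contents of every tensor in the
 * compute graph are dumped to the log.
 *
 * Typical invocation (a sketch; the binary name depends on how the examples
 * are built, the flags are the standard common-args ones):
 *
 *   llama-eval-callback -m model.gguf -p "hello world"
 */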

/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we can for example add an operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
 */
struct callback_data {
    std::vector<uint8_t> data;
};

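// render the tensor's ne[] dimensions as a comma-separated string, e.g. "4096, 1, 1, 1"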
static std::string ggml_ne_string(const ggml_tensor * t) {
    std::string str;
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        str += std::to_string(t->ne[i]);
        if (i + 1 < GGML_MAX_DIMS) {
            str += ", ";
        }
    }
    return str;
}

static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0;
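    // walk all four dimensions; within each one print at most the first and
    // last n elements and elide the middle with "..."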
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        LOG("                                     [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
                LOG("                                      ..., \n");
                i2 = ne[2] - n;
            }
            LOG("                                      [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
                    LOG("                                       ..., \n");
                    i1 = ne[1] - n;
                }
                LOG("                                       [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
                        LOG("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) &data[i];
                    } else {
                        GGML_ABORT("fatal error");
                    }
                    LOG("%12.4f", v);
                    sum += v;
                    if (i0 < ne[0] - 1) LOG(", ");
                }
                LOG("],\n");
            }
            LOG("                                      ],\n");
        }
        LOG("                                     ]\n");
        LOG("                                     sum = %f\n", sum);
    }
}

/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor;
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data passed to each callback
 * @return true to receive data or continue the graph, false otherwise
 */
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // Always retrieve data
    }

    char src1_str[128] = {0};
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    LOG("%s: %24s = (%s) %10s(%s{%s}, %s) = {%s}\n", __func__,
         t->name, ggml_type_name(t->type), ggml_op_desc(t),
         src0->name, ggml_ne_string(src0).c_str(),
         src1 ? src1_str : "",
         ggml_ne_string(t).c_str());

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

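    // only plain (non-quantized) types are printed; quantized tensors would have to be dequantized first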
    if (!ggml_is_quantized(t->type)) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}

static bool run(llama_context * ctx, const common_params & params) {
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

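    // evaluate the whole prompt in one batch; the eval callback fires for every node of the resulting graph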
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}

int main(int argc, char ** argv) {
    callback_data cb_data;

    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false; // skip the warmup decode so the callback only fires for the actual prompt

    // init
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
        LOG_INF("\n");
    }

    bool OK = run(ctx, params);
    if (!OK) {
        return 1;
    }

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}